diff --git a/.gitattributes b/.gitattributes index 326d5c1057d550e9dc9809bceaeb3d15f4f63dd7..1b4f67369cec14f19435e399cbed83a44b3a5c02 100644 --- a/.gitattributes +++ b/.gitattributes @@ -210,3 +210,63 @@ v127rc_exp2/B_mul/checkpoint-9500/tokenizer.json filter=lfs diff=lfs merge=lfs - v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2639aecc03a9e61a79818c0c50f08c4eb4eb6c53 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + mfjy22anxcucsb3vwlaimrwvqrgvipis: + args: + - /workspace/v127rc_exp1/C.yaml + cpu_count: 16 + cpu_count_logical: 32 + cudaVersion: "13.0" + disk: + /: + total: "21474836480" + used: "1858306048" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de + host: 47a53adf0198 + memory: + total: "201701408768" + os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T03:57:46.163443Z" + writerId: mfjy22anxcucsb3vwlaimrwvqrgvipis + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d35_r286 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/C +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - up_proj + - q_proj + - gate_proj + - k_proj + - v_proj + - o_proj + - down_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..24c994c6cdaee26cfd916f3e66cb7b2685b4d84a --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log @@ -0,0 +1,423 @@ + 0%| | 0/18595 [00:00 + sys.exit(main()) + ^^^^^^ + File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main + launcher.launch() + File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch + run_exp() + File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp + _training_function(config={"args": args, "callbacks": callbacks}) + File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function + run_pt(model_args, data_args, training_args, finetuning_args, callbacks) + File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train + return inner_training_loop( + ^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs, num_items_in_batch) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step + self.accelerator.backward(loss, **kwargs) + File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward + loss.backward(**kwargs) + File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward + torch.autograd.backward( + File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward + _engine_run_backward( + File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +KeyboardInterrupt diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..389c7ce87281fc3d1c59e3b6354de1275d6abc28 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T03:57:46.163443Z", + "args": [ + "/workspace/v127rc_exp1/C.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "47a53adf0198", + "executable": "/usr/bin/python", + "cpu_count": 16, + "cpu_count_logical": 32, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "1858306048" + } + }, + "memory": { + "total": "201701408768" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de" + } + ], + "cudaVersion": "13.0", + "writerId": "mfjy22anxcucsb3vwlaimrwvqrgvipis" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0af17f7f3e34899a7287a6233fffc92a8636c473 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/global_step":388,"train/grad_norm":2.0090420246124268,"train/learning_rate":4.9999916410392856e-05,"_wandb":{"runtime":396},"_runtime":396,"train/loss":0.8193472027778625,"_step":387,"train/epoch":0.1043291207313794,"train_runtime":396.6553,"train/train_tokens_per_second":2002.333,"_timestamp":1.770177862347725e+09,"train/num_input_tokens_seen":794236} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log b/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e18c7aa14d80fde45c3488eefc55493b29cb1c42 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log @@ -0,0 +1,299 @@ + 0%| | 0/40950 [00:00', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 04:03:33,192 INFO MainThread:7849 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 04:03:33,193 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 04:03:33,195 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t35_d0_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..320a1ee17a0e0e624288dee0df1ed72fc3107550 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + jy6in5azojamixlag12ky8yqk0a5luc8: + args: + - /workspace/v127rc_exp1/C.yaml + cpu_count: 16 + cpu_count_logical: 32 + cudaVersion: "13.0" + disk: + /: + total: "21474836480" + used: "1858318336" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de + host: 47a53adf0198 + memory: + total: "201701408768" + os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T04:05:44.037622Z" + writerId: jy6in5azojamixlag12ky8yqk0a5luc8 + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d35_r286 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/C +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - o_proj + - down_proj + - gate_proj + - v_proj + - k_proj + - q_proj + - up_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 266 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7016f07bb0d7c82e394712c2eaa661015062ca33 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log @@ -0,0 +1,191 @@ + 0%| | 0/18595 [00:00 + sys.exit(main()) + ^^^^^^ + File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main + launcher.launch() + File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch + run_exp() + File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp + _training_function(config={"args": args, "callbacks": callbacks}) + File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function + run_pt(model_args, data_args, training_args, finetuning_args, callbacks) + File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train + return inner_training_loop( + ^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs, num_items_in_batch) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step + self.accelerator.backward(loss, **kwargs) + File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward + loss.backward(**kwargs) + File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward + torch.autograd.backward( + File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward + _engine_run_backward( + File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward + return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +KeyboardInterrupt diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..250b7ca7baef222ee78261c629b347d5f4fe7859 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.3.7 +fastapi==0.128.0 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.51.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.1 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..337d59c3a45d0dd20588e7676c6d7a8cfb199e39 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T04:05:44.037622Z", + "args": [ + "/workspace/v127rc_exp1/C.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "47a53adf0198", + "executable": "/usr/bin/python", + "cpu_count": 16, + "cpu_count_logical": 32, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "1858318336" + } + }, + "memory": { + "total": "201701408768" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de" + } + ], + "cudaVersion": "13.0", + "writerId": "jy6in5azojamixlag12ky8yqk0a5luc8" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b3bbe83a9392f54989b42f524562f347b9249ad1 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json @@ -0,0 +1 @@ +{"_runtime":159,"_timestamp":1.770178104014671e+09,"train/grad_norm":0.7178835272789001,"_wandb":{"runtime":159},"train/train_tokens_per_second":1990.521,"train/num_input_tokens_seen":319332,"train/global_step":156,"train/epoch":0.041946759881688625,"train_runtime":160.4264,"train/loss":1.4694324731826782,"train/learning_rate":2.0833333333333336e-05,"_step":155} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..335ebd33ab58381e577957ad2d75c658050402f9 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2026-02-04T04:05:44.28893781Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-04T04:05:44.666073338Z","level":"INFO","msg":"stream: created new stream","id":"nj0w4q6e"} +{"time":"2026-02-04T04:05:44.666543269Z","level":"INFO","msg":"handler: started","stream_id":"nj0w4q6e"} +{"time":"2026-02-04T04:05:44.668183448Z","level":"INFO","msg":"stream: started","id":"nj0w4q6e"} +{"time":"2026-02-04T04:05:44.668196893Z","level":"INFO","msg":"writer: started","stream_id":"nj0w4q6e"} +{"time":"2026-02-04T04:05:44.668198065Z","level":"INFO","msg":"sender: started","stream_id":"nj0w4q6e"} +{"time":"2026-02-04T04:08:24.969216421Z","level":"INFO","msg":"stream: closing","id":"nj0w4q6e"} +{"time":"2026-02-04T04:08:25.578748227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-04T04:08:25.833732236Z","level":"INFO","msg":"handler: closed","stream_id":"nj0w4q6e"} +{"time":"2026-02-04T04:08:25.837480922Z","level":"INFO","msg":"sender: closed","stream_id":"nj0w4q6e"} +{"time":"2026-02-04T04:08:25.837821633Z","level":"INFO","msg":"stream: closed","id":"nj0w4q6e"} diff --git a/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..17e8365f001f44d715d1e6f34888f767c029791c --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Configure stats pid to 6386 +2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log +2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log +2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:init():844] calling init triggers +2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():892] starting backend +2026-02-04 04:05:44,278 INFO MainThread:6386 [wandb_init.py:init():895] sending inform_init request +2026-02-04 04:05:44,286 INFO MainThread:6386 [wandb_init.py:init():903] backend started and connected +2026-02-04 04:05:44,288 INFO MainThread:6386 [wandb_init.py:init():973] updated telemetry +2026-02-04 04:05:44,352 INFO MainThread:6386 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-04 04:05:44,992 INFO MainThread:6386 [wandb_init.py:init():1042] starting run threads in backend +2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_console_start():2529] atexit reg +2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-04 04:05:45,063 INFO MainThread:6386 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-04 04:05:45,064 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'down_proj', 'gate_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 266, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 04:05:45,073 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-04 04:08:24,969 INFO wandb-AsyncioManager-main:6386 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-04 04:08:24,970 INFO wandb-AsyncioManager-main:6386 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc26945e63952da6d31cf815448439ce321aacde --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + dq2kg12neczzbdsqmciypnior6fee84h: + args: + - /workspace/v127rc_exp1/B_dup.yaml + cpu_count: 16 + cpu_count_logical: 32 + cudaVersion: "12.7" + disk: + /: + total: "21474836480" + used: "2193969152" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1 + host: e5c6872797ac + memory: + total: "201701502976" + os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T08:35:48.570855Z" + writerId: dq2kg12neczzbdsqmciypnior6fee84h + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d35_r286 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/B_dup +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - o_proj + - gate_proj + - k_proj + - up_proj + - v_proj + - q_proj + - down_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..250b7ca7baef222ee78261c629b347d5f4fe7859 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.3.7 +fastapi==0.128.0 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.51.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.1 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2a1bac9d761f8d8ffe7febc6ab3a95c215d8a10a --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T08:35:48.570855Z", + "args": [ + "/workspace/v127rc_exp1/B_dup.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "e5c6872797ac", + "executable": "/usr/bin/python", + "cpu_count": 16, + "cpu_count_logical": 32, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2193969152" + } + }, + "memory": { + "total": "201701502976" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1" + } + ], + "cudaVersion": "12.7", + "writerId": "dq2kg12neczzbdsqmciypnior6fee84h" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e041b6a0d5fc30408f450783ca74c19918c92128 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/grad_norm":0.2597666084766388,"_step":73480,"train_samples_per_second":0.975,"_runtime":75384,"train/epoch":5,"_wandb":{"runtime":75384},"train/num_input_tokens_seen":150413560,"train/train_tokens_per_second":1995.358,"train/loss":0.014940977096557617,"train_steps_per_second":0.975,"_timestamp":1.7702695315018873e+09,"total_flos":6.869735474541773e+18,"train/learning_rate":2.379162700183457e-14,"train_loss":0.08730816244039097,"train_runtime":75383.3694,"train/global_step":73480} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..7a8e7f5affa5fd7bb2af8f508936f51c60a2b320 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log @@ -0,0 +1,14 @@ +{"time":"2026-02-04T08:35:48.826256258Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-04T08:35:49.141746844Z","level":"INFO","msg":"stream: created new stream","id":"pwixiyan"} +{"time":"2026-02-04T08:35:49.142115089Z","level":"INFO","msg":"handler: started","stream_id":"pwixiyan"} +{"time":"2026-02-04T08:35:49.143583725Z","level":"INFO","msg":"stream: started","id":"pwixiyan"} +{"time":"2026-02-04T08:35:49.143601157Z","level":"INFO","msg":"writer: started","stream_id":"pwixiyan"} +{"time":"2026-02-04T08:35:49.14359757Z","level":"INFO","msg":"sender: started","stream_id":"pwixiyan"} +{"time":"2026-02-04T17:47:19.818024452Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-04T18:31:07.413320842Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-04T22:59:10.135922468Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-05T05:32:13.77134292Z","level":"INFO","msg":"stream: closing","id":"pwixiyan"} +{"time":"2026-02-05T05:32:15.653703901Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-05T05:32:15.875179968Z","level":"INFO","msg":"handler: closed","stream_id":"pwixiyan"} +{"time":"2026-02-05T05:32:15.87824593Z","level":"INFO","msg":"sender: closed","stream_id":"pwixiyan"} +{"time":"2026-02-05T05:32:15.878535169Z","level":"INFO","msg":"stream: closed","id":"pwixiyan"} diff --git a/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4431bb42a9e9fc385a0a6c769cc75a9ada53d3ca --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Configure stats pid to 3069 +2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log +2026-02-04 08:35:48,590 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log +2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():844] calling init triggers +2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():892] starting backend +2026-02-04 08:35:48,817 INFO MainThread:3069 [wandb_init.py:init():895] sending inform_init request +2026-02-04 08:35:48,824 INFO MainThread:3069 [wandb_init.py:init():903] backend started and connected +2026-02-04 08:35:48,825 INFO MainThread:3069 [wandb_init.py:init():973] updated telemetry +2026-02-04 08:35:48,867 INFO MainThread:3069 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-04 08:35:49,594 INFO MainThread:3069 [wandb_init.py:init():1042] starting run threads in backend +2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_console_start():2529] atexit reg +2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-04 08:35:49,663 INFO MainThread:3069 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-04 08:35:49,664 INFO MainThread:3069 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-04 08:35:49,666 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'gate_proj', 'k_proj', 'up_proj', 'v_proj', 'q_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 08:35:49,674 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e11a35055f499c7923a741d7a870081af6c630c9 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + ymezb35dmjxj99q0ikd0taef6he5rsbn: + args: + - /workspace/v127rc_exp1/D_dup.yaml + cpu_count: 24 + cpu_count_logical: 48 + cudaVersion: "12.8" + disk: + /: + total: "21474836480" + used: "2203967488" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a + host: 313b3f58db2c + memory: + total: "270100414464" + os: Linux-6.8.0-78-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T08:56:16.046521Z" + writerId: ymezb35dmjxj99q0ikd0taef6he5rsbn + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d100_r101 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/D_dup +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - down_proj + - k_proj + - up_proj + - gate_proj + - o_proj + - q_proj + - v_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..250b7ca7baef222ee78261c629b347d5f4fe7859 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.3.7 +fastapi==0.128.0 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.51.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.1 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..873c123310ac6aa9c9694e7c6b6eeaa0c85f7381 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-78-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T08:56:16.046521Z", + "args": [ + "/workspace/v127rc_exp1/D_dup.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "313b3f58db2c", + "executable": "/usr/bin/python", + "cpu_count": 24, + "cpu_count_logical": 48, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2203967488" + } + }, + "memory": { + "total": "270100414464" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a" + } + ], + "cudaVersion": "12.8", + "writerId": "ymezb35dmjxj99q0ikd0taef6he5rsbn" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..19b5f90474680d623dcbca94f1e2abc421494636 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json @@ -0,0 +1 @@ +{"total_flos":7.007635036666829e+18,"_wandb":{"runtime":79122},"train/grad_norm":0.20166438817977905,"train_runtime":79119.4798,"_timestamp":1.7702744950489569e+09,"train/learning_rate":2.2864779514186752e-14,"_step":74955,"train_steps_per_second":0.947,"train/global_step":74955,"train/train_tokens_per_second":1939.332,"train_loss":0.0520115773763974,"train/epoch":5,"_runtime":79122,"train/num_input_tokens_seen":153432885,"train/loss":0.013762388378381729,"train_samples_per_second":0.947} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..01b20feb044d7a227672b108bda1ebf77f0b9bf0 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log @@ -0,0 +1,14 @@ +{"time":"2026-02-04T08:56:16.334273741Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-04T08:56:16.719436268Z","level":"INFO","msg":"stream: created new stream","id":"pnh57y4w"} +{"time":"2026-02-04T08:56:16.720193488Z","level":"INFO","msg":"handler: started","stream_id":"pnh57y4w"} +{"time":"2026-02-04T08:56:16.722437346Z","level":"INFO","msg":"stream: started","id":"pnh57y4w"} +{"time":"2026-02-04T08:56:16.722511208Z","level":"INFO","msg":"sender: started","stream_id":"pnh57y4w"} +{"time":"2026-02-04T08:56:16.722517428Z","level":"INFO","msg":"writer: started","stream_id":"pnh57y4w"} +{"time":"2026-02-04T18:51:17.561552143Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-04T21:10:50.641448939Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-04T21:51:53.27313763Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-05T06:54:59.294785648Z","level":"INFO","msg":"stream: closing","id":"pnh57y4w"} +{"time":"2026-02-05T06:55:01.38735749Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-05T06:55:01.616258321Z","level":"INFO","msg":"handler: closed","stream_id":"pnh57y4w"} +{"time":"2026-02-05T06:55:01.620481643Z","level":"INFO","msg":"sender: closed","stream_id":"pnh57y4w"} +{"time":"2026-02-05T06:55:01.620880145Z","level":"INFO","msg":"stream: closed","id":"pnh57y4w"} diff --git a/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f904a8edc01f34d3fcc423aff05c7dc85cd307bc --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-04 08:56:16,078 INFO MainThread:439 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-04 08:56:16,079 INFO MainThread:439 [wandb_setup.py:_flush():81] Configure stats pid to 439 +2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log +2026-02-04 08:56:16,081 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log +2026-02-04 08:56:16,082 INFO MainThread:439 [wandb_init.py:init():844] calling init triggers +2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():892] starting backend +2026-02-04 08:56:16,317 INFO MainThread:439 [wandb_init.py:init():895] sending inform_init request +2026-02-04 08:56:16,328 INFO MainThread:439 [wandb_init.py:init():903] backend started and connected +2026-02-04 08:56:16,331 INFO MainThread:439 [wandb_init.py:init():973] updated telemetry +2026-02-04 08:56:16,409 INFO MainThread:439 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-04 08:56:17,188 INFO MainThread:439 [wandb_init.py:init():1042] starting run threads in backend +2026-02-04 08:56:17,388 INFO MainThread:439 [wandb_run.py:_console_start():2529] atexit reg +2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-04 08:56:17,390 INFO MainThread:439 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-04 08:56:17,393 INFO MainThread:439 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-04 08:56:17,395 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['down_proj', 'k_proj', 'up_proj', 'gate_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 08:56:17,410 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d100_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-05 06:54:59,294 INFO wandb-AsyncioManager-main:439 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-05 06:54:59,296 INFO wandb-AsyncioManager-main:439 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e24273f89f7afe1fd637acdaddb7197b4e3ab114 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc: + args: + - /workspace/v127rc_exp1/C_dup.yaml + cpu_count: 16 + cpu_count_logical: 32 + cudaVersion: "12.8" + disk: + /: + total: "21474836480" + used: "2197102592" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-518d5b06-9437-a74a-eed0-11812394bafa + host: dbefea6e926e + memory: + total: "132536217600" + os: Linux-6.8.0-88-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T09:03:20.733865Z" + writerId: mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d70_r143 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/C_dup +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - k_proj + - o_proj + - q_proj + - gate_proj + - up_proj + - down_proj + - v_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..250b7ca7baef222ee78261c629b347d5f4fe7859 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.3.7 +fastapi==0.128.0 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.51.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.1 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4839a670ed1dfa5060a4bd9554f43901d6de1e --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-88-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T09:03:20.733865Z", + "args": [ + "/workspace/v127rc_exp1/C_dup.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "dbefea6e926e", + "executable": "/usr/bin/python", + "cpu_count": 16, + "cpu_count_logical": 32, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2197102592" + } + }, + "memory": { + "total": "132536217600" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-518d5b06-9437-a74a-eed0-11812394bafa" + } + ], + "cudaVersion": "12.8", + "writerId": "mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..58debfe6c61549f485b9002f3e9283d51a64b2b6 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json @@ -0,0 +1 @@ +{"train_runtime":76057.1863,"_runtime":76057,"train_loss":0.05950206121845679,"train/grad_norm":0.08892247080802917,"train/epoch":5,"train_steps_per_second":0.973,"train/learning_rate":2.343619187605839e-14,"train/train_tokens_per_second":1992.607,"_timestamp":1.7702718574597487e+09,"_step":74035,"total_flos":6.921623106392218e+18,"train_samples_per_second":0.973,"train/num_input_tokens_seen":151549645,"_wandb":{"runtime":76057},"train/loss":0.01741047017276287,"train/global_step":74035} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a0c33c35c2b87d2f872563784ca746d92dd363dd --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log @@ -0,0 +1,13 @@ +{"time":"2026-02-04T09:03:20.972443735Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-04T09:03:21.325948046Z","level":"INFO","msg":"stream: created new stream","id":"aseg728n"} +{"time":"2026-02-04T09:03:21.326834454Z","level":"INFO","msg":"handler: started","stream_id":"aseg728n"} +{"time":"2026-02-04T09:03:21.328230927Z","level":"INFO","msg":"stream: started","id":"aseg728n"} +{"time":"2026-02-04T09:03:21.328245133Z","level":"INFO","msg":"sender: started","stream_id":"aseg728n"} +{"time":"2026-02-04T09:03:21.32824351Z","level":"INFO","msg":"writer: started","stream_id":"aseg728n"} +{"time":"2026-02-04T19:00:37.019618501Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-04T19:04:09.622196123Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-05T06:10:59.110706011Z","level":"INFO","msg":"stream: closing","id":"aseg728n"} +{"time":"2026-02-05T06:11:01.208766135Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-05T06:11:01.529632193Z","level":"INFO","msg":"handler: closed","stream_id":"aseg728n"} +{"time":"2026-02-05T06:11:01.532583178Z","level":"INFO","msg":"sender: closed","stream_id":"aseg728n"} +{"time":"2026-02-05T06:11:01.53279222Z","level":"INFO","msg":"stream: closed","id":"aseg728n"} diff --git a/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log b/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..cae7c9de8c591dd26c1602169f9096fd801852ba --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Configure stats pid to 2574 +2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log +2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log +2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():844] calling init triggers +2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-04 09:03:20,753 INFO MainThread:2574 [wandb_init.py:init():892] starting backend +2026-02-04 09:03:20,966 INFO MainThread:2574 [wandb_init.py:init():895] sending inform_init request +2026-02-04 09:03:20,971 INFO MainThread:2574 [wandb_init.py:init():903] backend started and connected +2026-02-04 09:03:20,973 INFO MainThread:2574 [wandb_init.py:init():973] updated telemetry +2026-02-04 09:03:21,024 INFO MainThread:2574 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-04 09:03:21,802 INFO MainThread:2574 [wandb_init.py:init():1042] starting run threads in backend +2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_console_start():2529] atexit reg +2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-04 09:03:21,869 INFO MainThread:2574 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-04 09:03:21,870 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'gate_proj', 'up_proj', 'down_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 09:03:21,876 INFO MainThread:2574 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 09:03:21,877 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 09:03:21,879 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d70_r143'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-05 06:10:59,110 INFO wandb-AsyncioManager-main:2574 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-05 06:10:59,111 INFO wandb-AsyncioManager-main:2574 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..692eda17d9149b27a97b83d476e97ed038466155 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.1 + e: + km795qg4wugx2xk47glqbs7x5abb2ilt: + args: + - /workspace/v127rc_exp1/E_dup.yaml + cpu_count: 16 + cpu_count_logical: 32 + cudaVersion: "12.9" + disk: + /: + total: "21474836480" + used: "2198335488" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072 + host: 9acfbb3ac08f + memory: + total: "134123917312" + os: Linux-6.8.0-64-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-04T09:03:21.035088Z" + writerId: km795qg4wugx2xk47glqbs7x5abb2ilt + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.1 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t0_d119_r85 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/E_dup +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - up_proj + - q_proj + - k_proj + - down_proj + - gate_proj + - o_proj + - v_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fc76d6e0f779ad8d3f8928c0578de76fc37ebb5 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.3.7 +fastapi==0.128.0 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.51.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.1 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +blinker==1.4 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4c0d1c10a2c226128277aab56eafecf114438c3b --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-64-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-04T09:03:21.035088Z", + "args": [ + "/workspace/v127rc_exp1/E_dup.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "9acfbb3ac08f", + "executable": "/usr/bin/python", + "cpu_count": 16, + "cpu_count_logical": 32, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2198335488" + } + }, + "memory": { + "total": "134123917312" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072" + } + ], + "cudaVersion": "12.9", + "writerId": "km795qg4wugx2xk47glqbs7x5abb2ilt" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ff2dae5859c3b4625f720e2f6d5b7d824d2761f5 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json @@ -0,0 +1 @@ +{"train_runtime":75825.2674,"train/num_input_tokens_seen":151989750,"_timestamp":1.7702716258520179e+09,"train/train_tokens_per_second":2004.516,"total_flos":6.94172372053248e+18,"train/epoch":5,"train/loss":0.02155970223248005,"train_loss":0.048330643215257464,"_runtime":75825,"train_steps_per_second":0.979,"train/global_step":74250,"train/learning_rate":2.3300469886855526e-14,"train/grad_norm":0.11816766858100891,"_step":74250,"_wandb":{"runtime":75825},"train_samples_per_second":0.979} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e063277124133ee358845070c78ee733f97d49ba --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-04T09:03:21.282329291Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-04T09:03:21.632244677Z","level":"INFO","msg":"stream: created new stream","id":"9xr67hqd"} +{"time":"2026-02-04T09:03:21.632659472Z","level":"INFO","msg":"handler: started","stream_id":"9xr67hqd"} +{"time":"2026-02-04T09:03:21.634880563Z","level":"INFO","msg":"stream: started","id":"9xr67hqd"} +{"time":"2026-02-04T09:03:21.634903075Z","level":"INFO","msg":"writer: started","stream_id":"9xr67hqd"} +{"time":"2026-02-04T09:03:21.634920297Z","level":"INFO","msg":"sender: started","stream_id":"9xr67hqd"} +{"time":"2026-02-05T00:58:07.192823728Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/9xr67hqd/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-05T06:07:07.926217033Z","level":"INFO","msg":"stream: closing","id":"9xr67hqd"} +{"time":"2026-02-05T06:07:09.870964601Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-05T06:07:10.109026941Z","level":"INFO","msg":"handler: closed","stream_id":"9xr67hqd"} +{"time":"2026-02-05T06:07:10.114497568Z","level":"INFO","msg":"sender: closed","stream_id":"9xr67hqd"} +{"time":"2026-02-05T06:07:10.114763144Z","level":"INFO","msg":"stream: closed","id":"9xr67hqd"} diff --git a/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3e300034cc21c704b2b4c137399396a94f8e8f93 --- /dev/null +++ b/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-04 09:03:21,055 INFO MainThread:4473 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Configure stats pid to 4473 +2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log +2026-02-04 09:03:21,057 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log +2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():844] calling init triggers +2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-04 09:03:21,059 INFO MainThread:4473 [wandb_init.py:init():892] starting backend +2026-02-04 09:03:21,273 INFO MainThread:4473 [wandb_init.py:init():895] sending inform_init request +2026-02-04 09:03:21,279 INFO MainThread:4473 [wandb_init.py:init():903] backend started and connected +2026-02-04 09:03:21,282 INFO MainThread:4473 [wandb_init.py:init():973] updated telemetry +2026-02-04 09:03:21,345 INFO MainThread:4473 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-04 09:03:21,944 INFO MainThread:4473 [wandb_init.py:init():1042] starting run threads in backend +2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_console_start():2529] atexit reg +2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-04 09:03:22,039 INFO MainThread:4473 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-04 09:03:22,040 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['up_proj', 'q_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/E_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-04 09:03:22,047 INFO MainThread:4473 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-04 09:03:22,048 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-04 09:03:22,050 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d119_r85'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..12c99cccbc1ef1bb5c8c581736edf6e670350ca3 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log @@ -0,0 +1,16454 @@ + 0%| | 0/198585 [00:00> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 02:54:41,432 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 02:54:41,948 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-1000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 02:54:41,954 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-1000/tokenizer_config.json + +{'loss': '1.313', 'grad_norm': '1.39', 'learning_rate': '1.259e-05', 'epoch': '0.0252', 'num_input_tokens_seen': 2049047, 'train_runtime': '1038', 'train_tokens_per_second': '1975'} +{'loss': '1.478', 'grad_norm': '1.325', 'learning_rate': '1.26e-05', 'epoch': '0.02523', 'num_input_tokens_seen': 2051094, 'train_runtime': '1039', 'train_tokens_per_second': '1975'} +{'loss': '0.5281', 'grad_norm': '0.7583', 'learning_rate': '1.261e-05', 'epoch': '0.02525', 'num_input_tokens_seen': 2053141, 'train_runtime': '1040', 'train_tokens_per_second': '1975'} +{'loss': '1.251', 'grad_norm': '1.202', 'learning_rate': '1.263e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 2055188, 'train_runtime': '1041', 'train_tokens_per_second': '1975'} +{'loss': '2.388', 'grad_norm': '1.892', 'learning_rate': '1.264e-05', 'epoch': '0.0253', 'num_input_tokens_seen': 2057235, 'train_runtime': '1042', 'train_tokens_per_second': '1975'} +{'loss': '0.6915', 'grad_norm': '0.8628', 'learning_rate': '1.265e-05', 'epoch': '0.02533', 'num_input_tokens_seen': 2059282, 'train_runtime': '1043', 'train_tokens_per_second': '1975'} +{'loss': '0.5689', 'grad_norm': '0.8442', 'learning_rate': '1.266e-05', 'epoch': '0.02535', 'num_input_tokens_seen': 2061329, 'train_runtime': '1044', 'train_tokens_per_second': '1975'} +{'loss': '0.5987', 'grad_norm': '0.9078', 'learning_rate': '1.268e-05', 'epoch': '0.02538', 'num_input_tokens_seen': 2063376, 'train_runtime': '1045', 'train_tokens_per_second': '1975'} +{'loss': '0.7887', 'grad_norm': '0.9004', 'learning_rate': '1.269e-05', 'epoch': '0.0254', 'num_input_tokens_seen': 2065423, 'train_runtime': '1046', 'train_tokens_per_second': '1975'} +{'loss': '2.14', 'grad_norm': '1.621', 'learning_rate': '1.27e-05', 'epoch': '0.02543', 'num_input_tokens_seen': 2067470, 'train_runtime': '1047', 'train_tokens_per_second': '1975'} +{'loss': '1.165', 'grad_norm': '1.156', 'learning_rate': '1.271e-05', 'epoch': '0.02546', 'num_input_tokens_seen': 2069517, 'train_runtime': '1048', 'train_tokens_per_second': '1975'} +{'loss': '1.93', 'grad_norm': '1.66', 'learning_rate': '1.273e-05', 'epoch': '0.02548', 'num_input_tokens_seen': 2071564, 'train_runtime': '1049', 'train_tokens_per_second': '1975'} +{'loss': '1.608', 'grad_norm': '1.557', 'learning_rate': '1.274e-05', 'epoch': '0.02551', 'num_input_tokens_seen': 2073611, 'train_runtime': '1050', 'train_tokens_per_second': '1975'} +{'loss': '1.338', 'grad_norm': '1.142', 'learning_rate': '1.275e-05', 'epoch': '0.02553', 'num_input_tokens_seen': 2075658, 'train_runtime': '1051', 'train_tokens_per_second': '1975'} +{'loss': '0.4819', 'grad_norm': '0.8666', 'learning_rate': '1.276e-05', 'epoch': '0.02556', 'num_input_tokens_seen': 2077705, 'train_runtime': '1052', 'train_tokens_per_second': '1975'} +{'loss': '1.373', 'grad_norm': '1.224', 'learning_rate': '1.278e-05', 'epoch': '0.02558', 'num_input_tokens_seen': 2079752, 'train_runtime': '1053', 'train_tokens_per_second': '1975'} +{'loss': '1.459', 'grad_norm': '1.294', 'learning_rate': '1.279e-05', 'epoch': '0.02561', 'num_input_tokens_seen': 2081799, 'train_runtime': '1054', 'train_tokens_per_second': '1975'} +{'loss': '1.264', 'grad_norm': '1.326', 'learning_rate': '1.28e-05', 'epoch': '0.02563', 'num_input_tokens_seen': 2083846, 'train_runtime': '1055', 'train_tokens_per_second': '1975'} +{'loss': '1.256', 'grad_norm': '1.166', 'learning_rate': '1.281e-05', 'epoch': '0.02566', 'num_input_tokens_seen': 2085893, 'train_runtime': '1056', 'train_tokens_per_second': '1975'} +{'loss': '1.72', 'grad_norm': '1.242', 'learning_rate': '1.283e-05', 'epoch': '0.02568', 'num_input_tokens_seen': 2087940, 'train_runtime': '1057', 'train_tokens_per_second': '1975'} +{'loss': '1.125', 'grad_norm': '1.054', 'learning_rate': '1.284e-05', 'epoch': '0.02571', 'num_input_tokens_seen': 2089987, 'train_runtime': '1058', 'train_tokens_per_second': '1975'} +{'loss': '1.85', 'grad_norm': '1.491', 'learning_rate': '1.285e-05', 'epoch': '0.02573', 'num_input_tokens_seen': 2092034, 'train_runtime': '1059', 'train_tokens_per_second': '1975'} +{'loss': '0.7417', 'grad_norm': '0.9822', 'learning_rate': '1.287e-05', 'epoch': '0.02576', 'num_input_tokens_seen': 2094081, 'train_runtime': '1060', 'train_tokens_per_second': '1975'} +{'loss': '0.7242', 'grad_norm': '1.059', 'learning_rate': '1.288e-05', 'epoch': '0.02578', 'num_input_tokens_seen': 2096128, 'train_runtime': '1061', 'train_tokens_per_second': '1975'} +{'loss': '1.996', 'grad_norm': '1.847', 'learning_rate': '1.289e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 2098175, 'train_runtime': '1062', 'train_tokens_per_second': '1975'} +{'loss': '2.759', 'grad_norm': '1.742', 'learning_rate': '1.29e-05', 'epoch': '0.02583', 'num_input_tokens_seen': 2100222, 'train_runtime': '1063', 'train_tokens_per_second': '1975'} +{'loss': '1.59', 'grad_norm': '1.098', 'learning_rate': '1.292e-05', 'epoch': '0.02586', 'num_input_tokens_seen': 2102269, 'train_runtime': '1064', 'train_tokens_per_second': '1975'} +{'loss': '0.5994', 'grad_norm': '0.9326', 'learning_rate': '1.293e-05', 'epoch': '0.02588', 'num_input_tokens_seen': 2104316, 'train_runtime': '1066', 'train_tokens_per_second': '1975'} +{'loss': '1.493', 'grad_norm': '1.318', 'learning_rate': '1.294e-05', 'epoch': '0.02591', 'num_input_tokens_seen': 2106363, 'train_runtime': '1067', 'train_tokens_per_second': '1975'} +{'loss': '0.8141', 'grad_norm': '1.063', 'learning_rate': '1.295e-05', 'epoch': '0.02593', 'num_input_tokens_seen': 2108410, 'train_runtime': '1068', 'train_tokens_per_second': '1975'} +{'loss': '1.297', 'grad_norm': '1.324', 'learning_rate': '1.297e-05', 'epoch': '0.02596', 'num_input_tokens_seen': 2110457, 'train_runtime': '1069', 'train_tokens_per_second': '1975'} +{'loss': '0.9154', 'grad_norm': '1.059', 'learning_rate': '1.298e-05', 'epoch': '0.02598', 'num_input_tokens_seen': 2112504, 'train_runtime': '1070', 'train_tokens_per_second': '1975'} +{'loss': '2.12', 'grad_norm': '1.612', 'learning_rate': '1.299e-05', 'epoch': '0.02601', 'num_input_tokens_seen': 2114551, 'train_runtime': '1071', 'train_tokens_per_second': '1975'} +{'loss': '0.6472', 'grad_norm': '0.7983', 'learning_rate': '1.3e-05', 'epoch': '0.02603', 'num_input_tokens_seen': 2116598, 'train_runtime': '1072', 'train_tokens_per_second': '1975'} +{'loss': '1.593', 'grad_norm': '1.401', 'learning_rate': '1.302e-05', 'epoch': '0.02606', 'num_input_tokens_seen': 2118645, 'train_runtime': '1073', 'train_tokens_per_second': '1975'} +{'loss': '1.449', 'grad_norm': '1.256', 'learning_rate': '1.303e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 2120692, 'train_runtime': '1074', 'train_tokens_per_second': '1975'} +{'loss': '0.5865', 'grad_norm': '0.8442', 'learning_rate': '1.304e-05', 'epoch': '0.02611', 'num_input_tokens_seen': 2122739, 'train_runtime': '1075', 'train_tokens_per_second': '1975'} +{'loss': '1.353', 'grad_norm': '1.435', 'learning_rate': '1.305e-05', 'epoch': '0.02613', 'num_input_tokens_seen': 2124786, 'train_runtime': '1076', 'train_tokens_per_second': '1975'} +{'loss': '0.8841', 'grad_norm': '0.9037', 'learning_rate': '1.307e-05', 'epoch': '0.02616', 'num_input_tokens_seen': 2126833, 'train_runtime': '1077', 'train_tokens_per_second': '1975'} +{'loss': '3.032', 'grad_norm': '2.099', 'learning_rate': '1.308e-05', 'epoch': '0.02619', 'num_input_tokens_seen': 2128880, 'train_runtime': '1078', 'train_tokens_per_second': '1975'} +{'loss': '1.413', 'grad_norm': '1.482', 'learning_rate': '1.309e-05', 'epoch': '0.02621', 'num_input_tokens_seen': 2130927, 'train_runtime': '1079', 'train_tokens_per_second': '1975'} +{'loss': '0.7889', 'grad_norm': '0.8774', 'learning_rate': '1.31e-05', 'epoch': '0.02624', 'num_input_tokens_seen': 2132974, 'train_runtime': '1080', 'train_tokens_per_second': '1975'} +{'loss': '0.5123', 'grad_norm': '0.8039', 'learning_rate': '1.312e-05', 'epoch': '0.02626', 'num_input_tokens_seen': 2135021, 'train_runtime': '1081', 'train_tokens_per_second': '1975'} +{'loss': '2.016', 'grad_norm': '1.675', 'learning_rate': '1.313e-05', 'epoch': '0.02629', 'num_input_tokens_seen': 2137068, 'train_runtime': '1082', 'train_tokens_per_second': '1975'} +{'loss': '1.41', 'grad_norm': '1.423', 'learning_rate': '1.314e-05', 'epoch': '0.02631', 'num_input_tokens_seen': 2139115, 'train_runtime': '1083', 'train_tokens_per_second': '1975'} +{'loss': '1.133', 'grad_norm': '1.319', 'learning_rate': '1.315e-05', 'epoch': '0.02634', 'num_input_tokens_seen': 2141162, 'train_runtime': '1084', 'train_tokens_per_second': '1975'} +{'loss': '1.716', 'grad_norm': '2.065', 'learning_rate': '1.317e-05', 'epoch': '0.02636', 'num_input_tokens_seen': 2143209, 'train_runtime': '1085', 'train_tokens_per_second': '1975'} +{'loss': '1.171', 'grad_norm': '1.253', 'learning_rate': '1.318e-05', 'epoch': '0.02639', 'num_input_tokens_seen': 2145256, 'train_runtime': '1086', 'train_tokens_per_second': '1975'} +{'loss': '0.4851', 'grad_norm': '0.9227', 'learning_rate': '1.319e-05', 'epoch': '0.02641', 'num_input_tokens_seen': 2147303, 'train_runtime': '1087', 'train_tokens_per_second': '1975'} +{'loss': '1.503', 'grad_norm': '1.503', 'learning_rate': '1.32e-05', 'epoch': '0.02644', 'num_input_tokens_seen': 2149350, 'train_runtime': '1088', 'train_tokens_per_second': '1975'} +{'loss': '1.412', 'grad_norm': '1.343', 'learning_rate': '1.322e-05', 'epoch': '0.02646', 'num_input_tokens_seen': 2151397, 'train_runtime': '1089', 'train_tokens_per_second': '1975'} +{'loss': '0.7274', 'grad_norm': '0.9788', 'learning_rate': '1.323e-05', 'epoch': '0.02649', 'num_input_tokens_seen': 2153444, 'train_runtime': '1090', 'train_tokens_per_second': '1975'} +{'loss': '1.678', 'grad_norm': '1.38', 'learning_rate': '1.324e-05', 'epoch': '0.02651', 'num_input_tokens_seen': 2155491, 'train_runtime': '1091', 'train_tokens_per_second': '1975'} +{'loss': '0.5097', 'grad_norm': '0.7451', 'learning_rate': '1.326e-05', 'epoch': '0.02654', 'num_input_tokens_seen': 2157538, 'train_runtime': '1092', 'train_tokens_per_second': '1975'} +{'loss': '1.239', 'grad_norm': '1.332', 'learning_rate': '1.327e-05', 'epoch': '0.02656', 'num_input_tokens_seen': 2159585, 'train_runtime': '1093', 'train_tokens_per_second': '1975'} +{'loss': '0.6258', 'grad_norm': '0.8507', 'learning_rate': '1.328e-05', 'epoch': '0.02659', 'num_input_tokens_seen': 2161632, 'train_runtime': '1095', 'train_tokens_per_second': '1975'} +{'loss': '0.5774', 'grad_norm': '0.8793', 'learning_rate': '1.329e-05', 'epoch': '0.02661', 'num_input_tokens_seen': 2163679, 'train_runtime': '1096', 'train_tokens_per_second': '1975'} +{'loss': '0.6061', 'grad_norm': '0.9887', 'learning_rate': '1.331e-05', 'epoch': '0.02664', 'num_input_tokens_seen': 2165726, 'train_runtime': '1097', 'train_tokens_per_second': '1975'} +{'loss': '1.395', 'grad_norm': '1.273', 'learning_rate': '1.332e-05', 'epoch': '0.02666', 'num_input_tokens_seen': 2167773, 'train_runtime': '1098', 'train_tokens_per_second': '1975'} +{'loss': '1.462', 'grad_norm': '1.346', 'learning_rate': '1.333e-05', 'epoch': '0.02669', 'num_input_tokens_seen': 2169820, 'train_runtime': '1099', 'train_tokens_per_second': '1975'} +{'loss': '1.516', 'grad_norm': '1.379', 'learning_rate': '1.334e-05', 'epoch': '0.02671', 'num_input_tokens_seen': 2171867, 'train_runtime': '1100', 'train_tokens_per_second': '1975'} +{'loss': '0.5392', 'grad_norm': '0.7493', 'learning_rate': '1.336e-05', 'epoch': '0.02674', 'num_input_tokens_seen': 2173914, 'train_runtime': '1101', 'train_tokens_per_second': '1975'} +{'loss': '1.682', 'grad_norm': '1.604', 'learning_rate': '1.337e-05', 'epoch': '0.02676', 'num_input_tokens_seen': 2175961, 'train_runtime': '1102', 'train_tokens_per_second': '1975'} +{'loss': '1.475', 'grad_norm': '1.426', 'learning_rate': '1.338e-05', 'epoch': '0.02679', 'num_input_tokens_seen': 2178008, 'train_runtime': '1103', 'train_tokens_per_second': '1975'} +{'loss': '2.118', 'grad_norm': '1.866', 'learning_rate': '1.339e-05', 'epoch': '0.02681', 'num_input_tokens_seen': 2180055, 'train_runtime': '1104', 'train_tokens_per_second': '1975'} +{'loss': '0.5168', 'grad_norm': '0.8889', 'learning_rate': '1.341e-05', 'epoch': '0.02684', 'num_input_tokens_seen': 2182102, 'train_runtime': '1105', 'train_tokens_per_second': '1975'} +{'loss': '0.6566', 'grad_norm': '1.043', 'learning_rate': '1.342e-05', 'epoch': '0.02687', 'num_input_tokens_seen': 2184149, 'train_runtime': '1106', 'train_tokens_per_second': '1975'} +{'loss': '1.02', 'grad_norm': '1.295', 'learning_rate': '1.343e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 2186196, 'train_runtime': '1107', 'train_tokens_per_second': '1975'} +{'loss': '1.823', 'grad_norm': '1.575', 'learning_rate': '1.344e-05', 'epoch': '0.02692', 'num_input_tokens_seen': 2188243, 'train_runtime': '1108', 'train_tokens_per_second': '1975'} +{'loss': '0.5143', 'grad_norm': '0.9965', 'learning_rate': '1.346e-05', 'epoch': '0.02694', 'num_input_tokens_seen': 2190290, 'train_runtime': '1109', 'train_tokens_per_second': '1975'} +{'loss': '0.6422', 'grad_norm': '1.011', 'learning_rate': '1.347e-05', 'epoch': '0.02697', 'num_input_tokens_seen': 2192337, 'train_runtime': '1110', 'train_tokens_per_second': '1975'} +{'loss': '0.8245', 'grad_norm': '2.175', 'learning_rate': '1.348e-05', 'epoch': '0.02699', 'num_input_tokens_seen': 2194384, 'train_runtime': '1111', 'train_tokens_per_second': '1975'} +{'loss': '1.507', 'grad_norm': '1.538', 'learning_rate': '1.349e-05', 'epoch': '0.02702', 'num_input_tokens_seen': 2196431, 'train_runtime': '1112', 'train_tokens_per_second': '1975'} +{'loss': '0.521', 'grad_norm': '0.869', 'learning_rate': '1.351e-05', 'epoch': '0.02704', 'num_input_tokens_seen': 2198478, 'train_runtime': '1113', 'train_tokens_per_second': '1975'} +{'loss': '1.912', 'grad_norm': '1.494', 'learning_rate': '1.352e-05', 'epoch': '0.02707', 'num_input_tokens_seen': 2200525, 'train_runtime': '1114', 'train_tokens_per_second': '1975'} +{'loss': '1.393', 'grad_norm': '1.402', 'learning_rate': '1.353e-05', 'epoch': '0.02709', 'num_input_tokens_seen': 2202572, 'train_runtime': '1115', 'train_tokens_per_second': '1975'} +{'loss': '1.317', 'grad_norm': '1.334', 'learning_rate': '1.354e-05', 'epoch': '0.02712', 'num_input_tokens_seen': 2204619, 'train_runtime': '1116', 'train_tokens_per_second': '1975'} +{'loss': '2.802', 'grad_norm': '2.691', 'learning_rate': '1.356e-05', 'epoch': '0.02714', 'num_input_tokens_seen': 2206666, 'train_runtime': '1117', 'train_tokens_per_second': '1975'} +{'loss': '0.5218', 'grad_norm': '0.6329', 'learning_rate': '1.357e-05', 'epoch': '0.02717', 'num_input_tokens_seen': 2208713, 'train_runtime': '1118', 'train_tokens_per_second': '1975'} +{'loss': '1.918', 'grad_norm': '1.536', 'learning_rate': '1.358e-05', 'epoch': '0.02719', 'num_input_tokens_seen': 2210760, 'train_runtime': '1119', 'train_tokens_per_second': '1975'} +{'loss': '1.151', 'grad_norm': '1.279', 'learning_rate': '1.36e-05', 'epoch': '0.02722', 'num_input_tokens_seen': 2212807, 'train_runtime': '1120', 'train_tokens_per_second': '1975'} +{'loss': '3.05', 'grad_norm': '2.108', 'learning_rate': '1.361e-05', 'epoch': '0.02724', 'num_input_tokens_seen': 2214854, 'train_runtime': '1121', 'train_tokens_per_second': '1975'} +{'loss': '1.461', 'grad_norm': '1.55', 'learning_rate': '1.362e-05', 'epoch': '0.02727', 'num_input_tokens_seen': 2216901, 'train_runtime': '1122', 'train_tokens_per_second': '1975'} +{'loss': '0.6424', 'grad_norm': '0.9486', 'learning_rate': '1.363e-05', 'epoch': '0.02729', 'num_input_tokens_seen': 2218948, 'train_runtime': '1123', 'train_tokens_per_second': '1975'} +{'loss': '1.099', 'grad_norm': '1.23', 'learning_rate': '1.365e-05', 'epoch': '0.02732', 'num_input_tokens_seen': 2220995, 'train_runtime': '1125', 'train_tokens_per_second': '1975'} +{'loss': '2.238', 'grad_norm': '3.058', 'learning_rate': '1.366e-05', 'epoch': '0.02734', 'num_input_tokens_seen': 2223042, 'train_runtime': '1126', 'train_tokens_per_second': '1975'} +{'loss': '2.311', 'grad_norm': '1.534', 'learning_rate': '1.367e-05', 'epoch': '0.02737', 'num_input_tokens_seen': 2225089, 'train_runtime': '1127', 'train_tokens_per_second': '1975'} +{'loss': '2.195', 'grad_norm': '1.65', 'learning_rate': '1.368e-05', 'epoch': '0.02739', 'num_input_tokens_seen': 2227136, 'train_runtime': '1128', 'train_tokens_per_second': '1975'} +{'loss': '0.5158', 'grad_norm': '0.8642', 'learning_rate': '1.37e-05', 'epoch': '0.02742', 'num_input_tokens_seen': 2229183, 'train_runtime': '1129', 'train_tokens_per_second': '1975'} +{'loss': '1.363', 'grad_norm': '1.53', 'learning_rate': '1.371e-05', 'epoch': '0.02744', 'num_input_tokens_seen': 2231230, 'train_runtime': '1130', 'train_tokens_per_second': '1975'} +{'loss': '1.435', 'grad_norm': '1.533', 'learning_rate': '1.372e-05', 'epoch': '0.02747', 'num_input_tokens_seen': 2233277, 'train_runtime': '1131', 'train_tokens_per_second': '1975'} +{'loss': '1.217', 'grad_norm': '1.418', 'learning_rate': '1.373e-05', 'epoch': '0.02749', 'num_input_tokens_seen': 2235324, 'train_runtime': '1132', 'train_tokens_per_second': '1975'} +{'loss': '2.122', 'grad_norm': '1.994', 'learning_rate': '1.375e-05', 'epoch': '0.02752', 'num_input_tokens_seen': 2237371, 'train_runtime': '1133', 'train_tokens_per_second': '1975'} +{'loss': '0.5194', 'grad_norm': '0.889', 'learning_rate': '1.376e-05', 'epoch': '0.02754', 'num_input_tokens_seen': 2239418, 'train_runtime': '1134', 'train_tokens_per_second': '1975'} +{'loss': '0.7421', 'grad_norm': '1.108', 'learning_rate': '1.377e-05', 'epoch': '0.02757', 'num_input_tokens_seen': 2241465, 'train_runtime': '1135', 'train_tokens_per_second': '1975'} +{'loss': '2.362', 'grad_norm': '1.887', 'learning_rate': '1.378e-05', 'epoch': '0.0276', 'num_input_tokens_seen': 2243512, 'train_runtime': '1136', 'train_tokens_per_second': '1975'} +{'loss': '1.664', 'grad_norm': '1.433', 'learning_rate': '1.38e-05', 'epoch': '0.02762', 'num_input_tokens_seen': 2245559, 'train_runtime': '1137', 'train_tokens_per_second': '1975'} +{'loss': '1.027', 'grad_norm': '1.352', 'learning_rate': '1.381e-05', 'epoch': '0.02765', 'num_input_tokens_seen': 2247606, 'train_runtime': '1138', 'train_tokens_per_second': '1975'} +{'loss': '1.45', 'grad_norm': '1.338', 'learning_rate': '1.382e-05', 'epoch': '0.02767', 'num_input_tokens_seen': 2249653, 'train_runtime': '1139', 'train_tokens_per_second': '1975'} +{'loss': '0.9927', 'grad_norm': '1.266', 'learning_rate': '1.383e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 2251700, 'train_runtime': '1140', 'train_tokens_per_second': '1975'} +{'loss': '1.107', 'grad_norm': '1.214', 'learning_rate': '1.385e-05', 'epoch': '0.02772', 'num_input_tokens_seen': 2253747, 'train_runtime': '1141', 'train_tokens_per_second': '1975'} +{'loss': '1.975', 'grad_norm': '1.806', 'learning_rate': '1.386e-05', 'epoch': '0.02775', 'num_input_tokens_seen': 2255794, 'train_runtime': '1142', 'train_tokens_per_second': '1975'} +{'loss': '0.6395', 'grad_norm': '0.9664', 'learning_rate': '1.387e-05', 'epoch': '0.02777', 'num_input_tokens_seen': 2257841, 'train_runtime': '1143', 'train_tokens_per_second': '1975'} +{'loss': '1.599', 'grad_norm': '1.533', 'learning_rate': '1.388e-05', 'epoch': '0.0278', 'num_input_tokens_seen': 2259888, 'train_runtime': '1144', 'train_tokens_per_second': '1975'} +{'loss': '1.327', 'grad_norm': '1.473', 'learning_rate': '1.39e-05', 'epoch': '0.02782', 'num_input_tokens_seen': 2261935, 'train_runtime': '1145', 'train_tokens_per_second': '1975'} +{'loss': '1.668', 'grad_norm': '1.656', 'learning_rate': '1.391e-05', 'epoch': '0.02785', 'num_input_tokens_seen': 2263982, 'train_runtime': '1146', 'train_tokens_per_second': '1975'} +{'loss': '0.6981', 'grad_norm': '0.9939', 'learning_rate': '1.392e-05', 'epoch': '0.02787', 'num_input_tokens_seen': 2266029, 'train_runtime': '1147', 'train_tokens_per_second': '1975'} +{'loss': '0.7739', 'grad_norm': '0.9959', 'learning_rate': '1.394e-05', 'epoch': '0.0279', 'num_input_tokens_seen': 2268076, 'train_runtime': '1148', 'train_tokens_per_second': '1975'} +{'loss': '0.6236', 'grad_norm': '1.065', 'learning_rate': '1.395e-05', 'epoch': '0.02792', 'num_input_tokens_seen': 2270123, 'train_runtime': '1149', 'train_tokens_per_second': '1975'} +{'loss': '1.538', 'grad_norm': '5.787', 'learning_rate': '1.396e-05', 'epoch': '0.02795', 'num_input_tokens_seen': 2272170, 'train_runtime': '1150', 'train_tokens_per_second': '1975'} +{'loss': '1.918', 'grad_norm': '1.681', 'learning_rate': '1.397e-05', 'epoch': '0.02797', 'num_input_tokens_seen': 2274217, 'train_runtime': '1151', 'train_tokens_per_second': '1975'} +{'loss': '1.515', 'grad_norm': '1.414', 'learning_rate': '1.399e-05', 'epoch': '0.028', 'num_input_tokens_seen': 2276264, 'train_runtime': '1152', 'train_tokens_per_second': '1975'} +{'loss': '1.624', 'grad_norm': '1.767', 'learning_rate': '1.4e-05', 'epoch': '0.02802', 'num_input_tokens_seen': 2278311, 'train_runtime': '1153', 'train_tokens_per_second': '1975'} +{'loss': '0.6883', 'grad_norm': '0.9009', 'learning_rate': '1.401e-05', 'epoch': '0.02805', 'num_input_tokens_seen': 2280358, 'train_runtime': '1155', 'train_tokens_per_second': '1975'} +{'loss': '0.579', 'grad_norm': '1.122', 'learning_rate': '1.402e-05', 'epoch': '0.02807', 'num_input_tokens_seen': 2282405, 'train_runtime': '1156', 'train_tokens_per_second': '1975'} +{'loss': '2.972', 'grad_norm': '2.027', 'learning_rate': '1.404e-05', 'epoch': '0.0281', 'num_input_tokens_seen': 2284452, 'train_runtime': '1157', 'train_tokens_per_second': '1975'} +{'loss': '1.431', 'grad_norm': '1.421', 'learning_rate': '1.405e-05', 'epoch': '0.02812', 'num_input_tokens_seen': 2286499, 'train_runtime': '1158', 'train_tokens_per_second': '1975'} +{'loss': '0.5202', 'grad_norm': '0.9286', 'learning_rate': '1.406e-05', 'epoch': '0.02815', 'num_input_tokens_seen': 2288546, 'train_runtime': '1159', 'train_tokens_per_second': '1975'} +{'loss': '1.343', 'grad_norm': '1.48', 'learning_rate': '1.407e-05', 'epoch': '0.02817', 'num_input_tokens_seen': 2290593, 'train_runtime': '1160', 'train_tokens_per_second': '1975'} +{'loss': '1.251', 'grad_norm': '1.551', 'learning_rate': '1.409e-05', 'epoch': '0.0282', 'num_input_tokens_seen': 2292640, 'train_runtime': '1161', 'train_tokens_per_second': '1975'} +{'loss': '1.357', 'grad_norm': '1.698', 'learning_rate': '1.41e-05', 'epoch': '0.02822', 'num_input_tokens_seen': 2294687, 'train_runtime': '1162', 'train_tokens_per_second': '1975'} +{'loss': '1.9', 'grad_norm': '2.025', 'learning_rate': '1.411e-05', 'epoch': '0.02825', 'num_input_tokens_seen': 2296734, 'train_runtime': '1163', 'train_tokens_per_second': '1975'} +{'loss': '0.8015', 'grad_norm': '1.064', 'learning_rate': '1.412e-05', 'epoch': '0.02828', 'num_input_tokens_seen': 2298781, 'train_runtime': '1164', 'train_tokens_per_second': '1975'} +{'loss': '1.751', 'grad_norm': '1.663', 'learning_rate': '1.414e-05', 'epoch': '0.0283', 'num_input_tokens_seen': 2300828, 'train_runtime': '1165', 'train_tokens_per_second': '1975'} +{'loss': '0.9412', 'grad_norm': '1.085', 'learning_rate': '1.415e-05', 'epoch': '0.02833', 'num_input_tokens_seen': 2302875, 'train_runtime': '1166', 'train_tokens_per_second': '1975'} +{'loss': '0.4507', 'grad_norm': '0.8878', 'learning_rate': '1.416e-05', 'epoch': '0.02835', 'num_input_tokens_seen': 2304922, 'train_runtime': '1167', 'train_tokens_per_second': '1975'} +{'loss': '0.9554', 'grad_norm': '1.168', 'learning_rate': '1.417e-05', 'epoch': '0.02838', 'num_input_tokens_seen': 2306969, 'train_runtime': '1168', 'train_tokens_per_second': '1975'} +{'loss': '0.555', 'grad_norm': '0.9121', 'learning_rate': '1.419e-05', 'epoch': '0.0284', 'num_input_tokens_seen': 2309016, 'train_runtime': '1169', 'train_tokens_per_second': '1975'} +{'loss': '1.204', 'grad_norm': '1.402', 'learning_rate': '1.42e-05', 'epoch': '0.02843', 'num_input_tokens_seen': 2311063, 'train_runtime': '1170', 'train_tokens_per_second': '1975'} +{'loss': '1.065', 'grad_norm': '1.258', 'learning_rate': '1.421e-05', 'epoch': '0.02845', 'num_input_tokens_seen': 2313110, 'train_runtime': '1171', 'train_tokens_per_second': '1975'} +{'loss': '1.547', 'grad_norm': '1.854', 'learning_rate': '1.422e-05', 'epoch': '0.02848', 'num_input_tokens_seen': 2315157, 'train_runtime': '1172', 'train_tokens_per_second': '1975'} +{'loss': '1.026', 'grad_norm': '1.24', 'learning_rate': '1.424e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 2317204, 'train_runtime': '1173', 'train_tokens_per_second': '1975'} +{'loss': '1.544', 'grad_norm': '1.744', 'learning_rate': '1.425e-05', 'epoch': '0.02853', 'num_input_tokens_seen': 2319251, 'train_runtime': '1174', 'train_tokens_per_second': '1975'} +{'loss': '1.037', 'grad_norm': '1.239', 'learning_rate': '1.426e-05', 'epoch': '0.02855', 'num_input_tokens_seen': 2321298, 'train_runtime': '1175', 'train_tokens_per_second': '1975'} +{'loss': '2.508', 'grad_norm': '2.065', 'learning_rate': '1.427e-05', 'epoch': '0.02858', 'num_input_tokens_seen': 2323345, 'train_runtime': '1176', 'train_tokens_per_second': '1975'} +{'loss': '1.164', 'grad_norm': '1.531', 'learning_rate': '1.429e-05', 'epoch': '0.0286', 'num_input_tokens_seen': 2325392, 'train_runtime': '1177', 'train_tokens_per_second': '1975'} +{'loss': '0.61', 'grad_norm': '1.113', 'learning_rate': '1.43e-05', 'epoch': '0.02863', 'num_input_tokens_seen': 2327439, 'train_runtime': '1178', 'train_tokens_per_second': '1975'} +{'loss': '0.8584', 'grad_norm': '1.187', 'learning_rate': '1.431e-05', 'epoch': '0.02865', 'num_input_tokens_seen': 2329486, 'train_runtime': '1179', 'train_tokens_per_second': '1975'} +{'loss': '1.607', 'grad_norm': '1.778', 'learning_rate': '1.433e-05', 'epoch': '0.02868', 'num_input_tokens_seen': 2331533, 'train_runtime': '1180', 'train_tokens_per_second': '1975'} +{'loss': '1.116', 'grad_norm': '1.367', 'learning_rate': '1.434e-05', 'epoch': '0.0287', 'num_input_tokens_seen': 2333580, 'train_runtime': '1181', 'train_tokens_per_second': '1975'} +{'loss': '1.836', 'grad_norm': '1.938', 'learning_rate': '1.435e-05', 'epoch': '0.02873', 'num_input_tokens_seen': 2335627, 'train_runtime': '1182', 'train_tokens_per_second': '1975'} +{'loss': '0.658', 'grad_norm': '1.103', 'learning_rate': '1.436e-05', 'epoch': '0.02875', 'num_input_tokens_seen': 2337674, 'train_runtime': '1184', 'train_tokens_per_second': '1975'} +{'loss': '1.742', 'grad_norm': '1.745', 'learning_rate': '1.438e-05', 'epoch': '0.02878', 'num_input_tokens_seen': 2339721, 'train_runtime': '1185', 'train_tokens_per_second': '1975'} +{'loss': '1.147', 'grad_norm': '1.48', 'learning_rate': '1.439e-05', 'epoch': '0.0288', 'num_input_tokens_seen': 2341768, 'train_runtime': '1186', 'train_tokens_per_second': '1975'} +{'loss': '1.015', 'grad_norm': '1.405', 'learning_rate': '1.44e-05', 'epoch': '0.02883', 'num_input_tokens_seen': 2343815, 'train_runtime': '1187', 'train_tokens_per_second': '1975'} +{'loss': '2.09', 'grad_norm': '1.73', 'learning_rate': '1.441e-05', 'epoch': '0.02885', 'num_input_tokens_seen': 2345862, 'train_runtime': '1188', 'train_tokens_per_second': '1975'} +{'loss': '0.7343', 'grad_norm': '1.222', 'learning_rate': '1.443e-05', 'epoch': '0.02888', 'num_input_tokens_seen': 2347909, 'train_runtime': '1189', 'train_tokens_per_second': '1975'} +{'loss': '1.634', 'grad_norm': '1.598', 'learning_rate': '1.444e-05', 'epoch': '0.0289', 'num_input_tokens_seen': 2349956, 'train_runtime': '1190', 'train_tokens_per_second': '1975'} +{'loss': '0.5132', 'grad_norm': '1.043', 'learning_rate': '1.445e-05', 'epoch': '0.02893', 'num_input_tokens_seen': 2352003, 'train_runtime': '1191', 'train_tokens_per_second': '1975'} +{'loss': '2.983', 'grad_norm': '2.301', 'learning_rate': '1.446e-05', 'epoch': '0.02895', 'num_input_tokens_seen': 2354050, 'train_runtime': '1192', 'train_tokens_per_second': '1975'} +{'loss': '1.072', 'grad_norm': '1.594', 'learning_rate': '1.448e-05', 'epoch': '0.02898', 'num_input_tokens_seen': 2356097, 'train_runtime': '1193', 'train_tokens_per_second': '1975'} +{'loss': '0.6918', 'grad_norm': '1.136', 'learning_rate': '1.449e-05', 'epoch': '0.02901', 'num_input_tokens_seen': 2358144, 'train_runtime': '1194', 'train_tokens_per_second': '1975'} +{'loss': '1.494', 'grad_norm': '1.547', 'learning_rate': '1.45e-05', 'epoch': '0.02903', 'num_input_tokens_seen': 2360191, 'train_runtime': '1195', 'train_tokens_per_second': '1975'} +{'loss': '1.661', 'grad_norm': '1.724', 'learning_rate': '1.451e-05', 'epoch': '0.02906', 'num_input_tokens_seen': 2362238, 'train_runtime': '1196', 'train_tokens_per_second': '1975'} +{'loss': '1.376', 'grad_norm': '1.639', 'learning_rate': '1.453e-05', 'epoch': '0.02908', 'num_input_tokens_seen': 2364285, 'train_runtime': '1197', 'train_tokens_per_second': '1975'} +{'loss': '1.099', 'grad_norm': '1.526', 'learning_rate': '1.454e-05', 'epoch': '0.02911', 'num_input_tokens_seen': 2366332, 'train_runtime': '1198', 'train_tokens_per_second': '1975'} +{'loss': '0.9282', 'grad_norm': '3.358', 'learning_rate': '1.455e-05', 'epoch': '0.02913', 'num_input_tokens_seen': 2368379, 'train_runtime': '1199', 'train_tokens_per_second': '1975'} +{'loss': '1.954', 'grad_norm': '1.905', 'learning_rate': '1.456e-05', 'epoch': '0.02916', 'num_input_tokens_seen': 2370426, 'train_runtime': '1200', 'train_tokens_per_second': '1975'} +{'loss': '0.5819', 'grad_norm': '1.115', 'learning_rate': '1.458e-05', 'epoch': '0.02918', 'num_input_tokens_seen': 2372473, 'train_runtime': '1201', 'train_tokens_per_second': '1975'} +{'loss': '0.6383', 'grad_norm': '1.156', 'learning_rate': '1.459e-05', 'epoch': '0.02921', 'num_input_tokens_seen': 2374520, 'train_runtime': '1202', 'train_tokens_per_second': '1975'} +{'loss': '0.8135', 'grad_norm': '1.088', 'learning_rate': '1.46e-05', 'epoch': '0.02923', 'num_input_tokens_seen': 2376567, 'train_runtime': '1203', 'train_tokens_per_second': '1975'} +{'loss': '0.5948', 'grad_norm': '1.316', 'learning_rate': '1.461e-05', 'epoch': '0.02926', 'num_input_tokens_seen': 2378614, 'train_runtime': '1204', 'train_tokens_per_second': '1975'} +{'loss': '0.5948', 'grad_norm': '1.083', 'learning_rate': '1.463e-05', 'epoch': '0.02928', 'num_input_tokens_seen': 2380661, 'train_runtime': '1205', 'train_tokens_per_second': '1975'} +{'loss': '1.273', 'grad_norm': '1.353', 'learning_rate': '1.464e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 2382708, 'train_runtime': '1206', 'train_tokens_per_second': '1975'} +{'loss': '1.749', 'grad_norm': '1.653', 'learning_rate': '1.465e-05', 'epoch': '0.02933', 'num_input_tokens_seen': 2384755, 'train_runtime': '1207', 'train_tokens_per_second': '1975'} +{'loss': '1.07', 'grad_norm': '1.477', 'learning_rate': '1.467e-05', 'epoch': '0.02936', 'num_input_tokens_seen': 2386802, 'train_runtime': '1208', 'train_tokens_per_second': '1975'} +{'loss': '1.672', 'grad_norm': '2.117', 'learning_rate': '1.468e-05', 'epoch': '0.02938', 'num_input_tokens_seen': 2388849, 'train_runtime': '1209', 'train_tokens_per_second': '1975'} +{'loss': '1.622', 'grad_norm': '1.786', 'learning_rate': '1.469e-05', 'epoch': '0.02941', 'num_input_tokens_seen': 2390896, 'train_runtime': '1210', 'train_tokens_per_second': '1975'} +{'loss': '0.6458', 'grad_norm': '1.14', 'learning_rate': '1.47e-05', 'epoch': '0.02943', 'num_input_tokens_seen': 2392943, 'train_runtime': '1212', 'train_tokens_per_second': '1975'} +{'loss': '3.683', 'grad_norm': '2.192', 'learning_rate': '1.472e-05', 'epoch': '0.02946', 'num_input_tokens_seen': 2394990, 'train_runtime': '1213', 'train_tokens_per_second': '1975'} +{'loss': '1.096', 'grad_norm': '1.356', 'learning_rate': '1.473e-05', 'epoch': '0.02948', 'num_input_tokens_seen': 2397037, 'train_runtime': '1214', 'train_tokens_per_second': '1975'} +{'loss': '1.318', 'grad_norm': '1.703', 'learning_rate': '1.474e-05', 'epoch': '0.02951', 'num_input_tokens_seen': 2399084, 'train_runtime': '1215', 'train_tokens_per_second': '1975'} +{'loss': '1.607', 'grad_norm': '1.966', 'learning_rate': '1.475e-05', 'epoch': '0.02953', 'num_input_tokens_seen': 2401131, 'train_runtime': '1216', 'train_tokens_per_second': '1975'} +{'loss': '1.552', 'grad_norm': '1.59', 'learning_rate': '1.477e-05', 'epoch': '0.02956', 'num_input_tokens_seen': 2403178, 'train_runtime': '1217', 'train_tokens_per_second': '1975'} +{'loss': '1.007', 'grad_norm': '1.444', 'learning_rate': '1.478e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 2405225, 'train_runtime': '1218', 'train_tokens_per_second': '1975'} +{'loss': '1.572', 'grad_norm': '1.653', 'learning_rate': '1.479e-05', 'epoch': '0.02961', 'num_input_tokens_seen': 2407272, 'train_runtime': '1219', 'train_tokens_per_second': '1975'} +{'loss': '0.5015', 'grad_norm': '0.9343', 'learning_rate': '1.48e-05', 'epoch': '0.02963', 'num_input_tokens_seen': 2409319, 'train_runtime': '1220', 'train_tokens_per_second': '1975'} +{'loss': '0.9988', 'grad_norm': '1.22', 'learning_rate': '1.482e-05', 'epoch': '0.02966', 'num_input_tokens_seen': 2411366, 'train_runtime': '1221', 'train_tokens_per_second': '1975'} +{'loss': '1.61', 'grad_norm': '1.845', 'learning_rate': '1.483e-05', 'epoch': '0.02969', 'num_input_tokens_seen': 2413413, 'train_runtime': '1222', 'train_tokens_per_second': '1975'} +{'loss': '0.4367', 'grad_norm': '0.8149', 'learning_rate': '1.484e-05', 'epoch': '0.02971', 'num_input_tokens_seen': 2415460, 'train_runtime': '1223', 'train_tokens_per_second': '1975'} +{'loss': '0.5599', 'grad_norm': '1.073', 'learning_rate': '1.485e-05', 'epoch': '0.02974', 'num_input_tokens_seen': 2417507, 'train_runtime': '1224', 'train_tokens_per_second': '1975'} +{'loss': '1.627', 'grad_norm': '1.907', 'learning_rate': '1.487e-05', 'epoch': '0.02976', 'num_input_tokens_seen': 2419554, 'train_runtime': '1225', 'train_tokens_per_second': '1975'} +{'loss': '0.6666', 'grad_norm': '1.112', 'learning_rate': '1.488e-05', 'epoch': '0.02979', 'num_input_tokens_seen': 2421601, 'train_runtime': '1226', 'train_tokens_per_second': '1975'} +{'loss': '0.9688', 'grad_norm': '1.602', 'learning_rate': '1.489e-05', 'epoch': '0.02981', 'num_input_tokens_seen': 2423648, 'train_runtime': '1227', 'train_tokens_per_second': '1975'} +{'loss': '1.583', 'grad_norm': '1.767', 'learning_rate': '1.49e-05', 'epoch': '0.02984', 'num_input_tokens_seen': 2425695, 'train_runtime': '1228', 'train_tokens_per_second': '1975'} +{'loss': '1.276', 'grad_norm': '1.707', 'learning_rate': '1.492e-05', 'epoch': '0.02986', 'num_input_tokens_seen': 2427742, 'train_runtime': '1229', 'train_tokens_per_second': '1975'} +{'loss': '0.5003', 'grad_norm': '0.8164', 'learning_rate': '1.493e-05', 'epoch': '0.02989', 'num_input_tokens_seen': 2429789, 'train_runtime': '1230', 'train_tokens_per_second': '1975'} +{'loss': '0.7324', 'grad_norm': '1.412', 'learning_rate': '1.494e-05', 'epoch': '0.02991', 'num_input_tokens_seen': 2431836, 'train_runtime': '1231', 'train_tokens_per_second': '1975'} +{'loss': '0.5689', 'grad_norm': '1.15', 'learning_rate': '1.495e-05', 'epoch': '0.02994', 'num_input_tokens_seen': 2433883, 'train_runtime': '1232', 'train_tokens_per_second': '1975'} +{'loss': '1.992', 'grad_norm': '2.144', 'learning_rate': '1.497e-05', 'epoch': '0.02996', 'num_input_tokens_seen': 2435930, 'train_runtime': '1233', 'train_tokens_per_second': '1975'} +{'loss': '0.5446', 'grad_norm': '0.9213', 'learning_rate': '1.498e-05', 'epoch': '0.02999', 'num_input_tokens_seen': 2437977, 'train_runtime': '1234', 'train_tokens_per_second': '1975'} +{'loss': '1.812', 'grad_norm': '1.751', 'learning_rate': '1.499e-05', 'epoch': '0.03001', 'num_input_tokens_seen': 2440024, 'train_runtime': '1235', 'train_tokens_per_second': '1975'} +{'loss': '0.928', 'grad_norm': '1.585', 'learning_rate': '1.501e-05', 'epoch': '0.03004', 'num_input_tokens_seen': 2442071, 'train_runtime': '1236', 'train_tokens_per_second': '1975'} +{'loss': '2.589', 'grad_norm': '2.36', 'learning_rate': '1.502e-05', 'epoch': '0.03006', 'num_input_tokens_seen': 2444118, 'train_runtime': '1237', 'train_tokens_per_second': '1975'} +{'loss': '0.8027', 'grad_norm': '1.299', 'learning_rate': '1.503e-05', 'epoch': '0.03009', 'num_input_tokens_seen': 2446165, 'train_runtime': '1238', 'train_tokens_per_second': '1975'} +{'loss': '1.17', 'grad_norm': '1.606', 'learning_rate': '1.504e-05', 'epoch': '0.03011', 'num_input_tokens_seen': 2448212, 'train_runtime': '1239', 'train_tokens_per_second': '1975'} +{'loss': '1.439', 'grad_norm': '1.756', 'learning_rate': '1.506e-05', 'epoch': '0.03014', 'num_input_tokens_seen': 2450259, 'train_runtime': '1240', 'train_tokens_per_second': '1975'} +{'loss': '1.801', 'grad_norm': '1.949', 'learning_rate': '1.507e-05', 'epoch': '0.03016', 'num_input_tokens_seen': 2452306, 'train_runtime': '1241', 'train_tokens_per_second': '1975'} +{'loss': '1.137', 'grad_norm': '1.475', 'learning_rate': '1.508e-05', 'epoch': '0.03019', 'num_input_tokens_seen': 2454353, 'train_runtime': '1243', 'train_tokens_per_second': '1975'} +{'loss': '1.099', 'grad_norm': '1.429', 'learning_rate': '1.509e-05', 'epoch': '0.03021', 'num_input_tokens_seen': 2456400, 'train_runtime': '1244', 'train_tokens_per_second': '1975'} +{'loss': '0.8799', 'grad_norm': '1.224', 'learning_rate': '1.511e-05', 'epoch': '0.03024', 'num_input_tokens_seen': 2458447, 'train_runtime': '1245', 'train_tokens_per_second': '1975'} +{'loss': '2.368', 'grad_norm': '2.262', 'learning_rate': '1.512e-05', 'epoch': '0.03026', 'num_input_tokens_seen': 2460494, 'train_runtime': '1246', 'train_tokens_per_second': '1975'} +{'loss': '0.5899', 'grad_norm': '1.132', 'learning_rate': '1.513e-05', 'epoch': '0.03029', 'num_input_tokens_seen': 2462541, 'train_runtime': '1247', 'train_tokens_per_second': '1975'} +{'loss': '0.7816', 'grad_norm': '1.38', 'learning_rate': '1.514e-05', 'epoch': '0.03031', 'num_input_tokens_seen': 2464588, 'train_runtime': '1248', 'train_tokens_per_second': '1975'} +{'loss': '1.048', 'grad_norm': '1.389', 'learning_rate': '1.516e-05', 'epoch': '0.03034', 'num_input_tokens_seen': 2466635, 'train_runtime': '1249', 'train_tokens_per_second': '1975'} +{'loss': '0.7662', 'grad_norm': '1.42', 'learning_rate': '1.517e-05', 'epoch': '0.03036', 'num_input_tokens_seen': 2468682, 'train_runtime': '1250', 'train_tokens_per_second': '1975'} +{'loss': '0.9972', 'grad_norm': '1.499', 'learning_rate': '1.518e-05', 'epoch': '0.03039', 'num_input_tokens_seen': 2470729, 'train_runtime': '1251', 'train_tokens_per_second': '1975'} +{'loss': '0.5303', 'grad_norm': '1.125', 'learning_rate': '1.519e-05', 'epoch': '0.03042', 'num_input_tokens_seen': 2472776, 'train_runtime': '1252', 'train_tokens_per_second': '1975'} +{'loss': '0.6254', 'grad_norm': '1.199', 'learning_rate': '1.521e-05', 'epoch': '0.03044', 'num_input_tokens_seen': 2474823, 'train_runtime': '1253', 'train_tokens_per_second': '1975'} +{'loss': '0.7562', 'grad_norm': '1.199', 'learning_rate': '1.522e-05', 'epoch': '0.03047', 'num_input_tokens_seen': 2476870, 'train_runtime': '1254', 'train_tokens_per_second': '1975'} +{'loss': '1.107', 'grad_norm': '1.763', 'learning_rate': '1.523e-05', 'epoch': '0.03049', 'num_input_tokens_seen': 2478917, 'train_runtime': '1255', 'train_tokens_per_second': '1975'} +{'loss': '0.989', 'grad_norm': '1.541', 'learning_rate': '1.524e-05', 'epoch': '0.03052', 'num_input_tokens_seen': 2480964, 'train_runtime': '1256', 'train_tokens_per_second': '1975'} +{'loss': '1.446', 'grad_norm': '1.658', 'learning_rate': '1.526e-05', 'epoch': '0.03054', 'num_input_tokens_seen': 2483011, 'train_runtime': '1257', 'train_tokens_per_second': '1975'} +{'loss': '0.6535', 'grad_norm': '1.318', 'learning_rate': '1.527e-05', 'epoch': '0.03057', 'num_input_tokens_seen': 2485058, 'train_runtime': '1258', 'train_tokens_per_second': '1975'} +{'loss': '0.4715', 'grad_norm': '1.079', 'learning_rate': '1.528e-05', 'epoch': '0.03059', 'num_input_tokens_seen': 2487105, 'train_runtime': '1259', 'train_tokens_per_second': '1975'} +{'loss': '1.211', 'grad_norm': '1.901', 'learning_rate': '1.529e-05', 'epoch': '0.03062', 'num_input_tokens_seen': 2489152, 'train_runtime': '1260', 'train_tokens_per_second': '1975'} +{'loss': '0.6025', 'grad_norm': '1.152', 'learning_rate': '1.531e-05', 'epoch': '0.03064', 'num_input_tokens_seen': 2491199, 'train_runtime': '1261', 'train_tokens_per_second': '1975'} +{'loss': '0.8967', 'grad_norm': '1.394', 'learning_rate': '1.532e-05', 'epoch': '0.03067', 'num_input_tokens_seen': 2493246, 'train_runtime': '1262', 'train_tokens_per_second': '1975'} +{'loss': '2.353', 'grad_norm': '2.274', 'learning_rate': '1.533e-05', 'epoch': '0.03069', 'num_input_tokens_seen': 2495293, 'train_runtime': '1263', 'train_tokens_per_second': '1975'} +{'loss': '2.295', 'grad_norm': '2.091', 'learning_rate': '1.534e-05', 'epoch': '0.03072', 'num_input_tokens_seen': 2497340, 'train_runtime': '1264', 'train_tokens_per_second': '1975'} +{'loss': '0.4965', 'grad_norm': '0.8925', 'learning_rate': '1.536e-05', 'epoch': '0.03074', 'num_input_tokens_seen': 2499387, 'train_runtime': '1265', 'train_tokens_per_second': '1975'} +{'loss': '1.924', 'grad_norm': '1.968', 'learning_rate': '1.537e-05', 'epoch': '0.03077', 'num_input_tokens_seen': 2501434, 'train_runtime': '1266', 'train_tokens_per_second': '1975'} +{'loss': '1.643', 'grad_norm': '1.842', 'learning_rate': '1.538e-05', 'epoch': '0.03079', 'num_input_tokens_seen': 2503481, 'train_runtime': '1267', 'train_tokens_per_second': '1975'} +{'loss': '0.4257', 'grad_norm': '0.7954', 'learning_rate': '1.54e-05', 'epoch': '0.03082', 'num_input_tokens_seen': 2505528, 'train_runtime': '1268', 'train_tokens_per_second': '1975'} +{'loss': '1.211', 'grad_norm': '1.419', 'learning_rate': '1.541e-05', 'epoch': '0.03084', 'num_input_tokens_seen': 2507575, 'train_runtime': '1269', 'train_tokens_per_second': '1975'} +{'loss': '1.707', 'grad_norm': '2.349', 'learning_rate': '1.542e-05', 'epoch': '0.03087', 'num_input_tokens_seen': 2509622, 'train_runtime': '1270', 'train_tokens_per_second': '1975'} +{'loss': '1.396', 'grad_norm': '1.753', 'learning_rate': '1.543e-05', 'epoch': '0.03089', 'num_input_tokens_seen': 2511669, 'train_runtime': '1272', 'train_tokens_per_second': '1975'} +{'loss': '0.5534', 'grad_norm': '1.088', 'learning_rate': '1.545e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 2513716, 'train_runtime': '1273', 'train_tokens_per_second': '1975'} +{'loss': '1.025', 'grad_norm': '1.341', 'learning_rate': '1.546e-05', 'epoch': '0.03094', 'num_input_tokens_seen': 2515763, 'train_runtime': '1274', 'train_tokens_per_second': '1975'} +{'loss': '1.399', 'grad_norm': '1.855', 'learning_rate': '1.547e-05', 'epoch': '0.03097', 'num_input_tokens_seen': 2517810, 'train_runtime': '1275', 'train_tokens_per_second': '1975'} +{'loss': '2.182', 'grad_norm': '2.365', 'learning_rate': '1.548e-05', 'epoch': '0.03099', 'num_input_tokens_seen': 2519857, 'train_runtime': '1276', 'train_tokens_per_second': '1975'} +{'loss': '0.7387', 'grad_norm': '1.304', 'learning_rate': '1.55e-05', 'epoch': '0.03102', 'num_input_tokens_seen': 2521904, 'train_runtime': '1277', 'train_tokens_per_second': '1975'} +{'loss': '1.941', 'grad_norm': '2.127', 'learning_rate': '1.551e-05', 'epoch': '0.03104', 'num_input_tokens_seen': 2523951, 'train_runtime': '1278', 'train_tokens_per_second': '1975'} +{'loss': '1.384', 'grad_norm': '1.75', 'learning_rate': '1.552e-05', 'epoch': '0.03107', 'num_input_tokens_seen': 2525998, 'train_runtime': '1279', 'train_tokens_per_second': '1975'} +{'loss': '1.225', 'grad_norm': '1.837', 'learning_rate': '1.553e-05', 'epoch': '0.03109', 'num_input_tokens_seen': 2528045, 'train_runtime': '1280', 'train_tokens_per_second': '1975'} +{'loss': '1.294', 'grad_norm': '1.61', 'learning_rate': '1.555e-05', 'epoch': '0.03112', 'num_input_tokens_seen': 2530092, 'train_runtime': '1281', 'train_tokens_per_second': '1975'} +{'loss': '1.079', 'grad_norm': '1.782', 'learning_rate': '1.556e-05', 'epoch': '0.03115', 'num_input_tokens_seen': 2532139, 'train_runtime': '1282', 'train_tokens_per_second': '1975'} +{'loss': '1.155', 'grad_norm': '1.548', 'learning_rate': '1.557e-05', 'epoch': '0.03117', 'num_input_tokens_seen': 2534186, 'train_runtime': '1283', 'train_tokens_per_second': '1975'} +{'loss': '1.89', 'grad_norm': '1.945', 'learning_rate': '1.558e-05', 'epoch': '0.0312', 'num_input_tokens_seen': 2536233, 'train_runtime': '1284', 'train_tokens_per_second': '1975'} +{'loss': '0.5924', 'grad_norm': '1.292', 'learning_rate': '1.56e-05', 'epoch': '0.03122', 'num_input_tokens_seen': 2538280, 'train_runtime': '1285', 'train_tokens_per_second': '1975'} +{'loss': '0.5191', 'grad_norm': '1.174', 'learning_rate': '1.561e-05', 'epoch': '0.03125', 'num_input_tokens_seen': 2540327, 'train_runtime': '1286', 'train_tokens_per_second': '1975'} +{'loss': '0.443', 'grad_norm': '1.048', 'learning_rate': '1.562e-05', 'epoch': '0.03127', 'num_input_tokens_seen': 2542374, 'train_runtime': '1287', 'train_tokens_per_second': '1975'} +{'loss': '0.4818', 'grad_norm': '1.071', 'learning_rate': '1.563e-05', 'epoch': '0.0313', 'num_input_tokens_seen': 2544421, 'train_runtime': '1288', 'train_tokens_per_second': '1975'} +{'loss': '2.168', 'grad_norm': '2.18', 'learning_rate': '1.565e-05', 'epoch': '0.03132', 'num_input_tokens_seen': 2546468, 'train_runtime': '1289', 'train_tokens_per_second': '1975'} +{'loss': '1.725', 'grad_norm': '1.891', 'learning_rate': '1.566e-05', 'epoch': '0.03135', 'num_input_tokens_seen': 2548515, 'train_runtime': '1290', 'train_tokens_per_second': '1975'} +{'loss': '0.9605', 'grad_norm': '1.567', 'learning_rate': '1.567e-05', 'epoch': '0.03137', 'num_input_tokens_seen': 2550562, 'train_runtime': '1291', 'train_tokens_per_second': '1975'} +{'loss': '1.153', 'grad_norm': '1.557', 'learning_rate': '1.568e-05', 'epoch': '0.0314', 'num_input_tokens_seen': 2552609, 'train_runtime': '1292', 'train_tokens_per_second': '1975'} +{'loss': '1.14', 'grad_norm': '1.685', 'learning_rate': '1.57e-05', 'epoch': '0.03142', 'num_input_tokens_seen': 2554656, 'train_runtime': '1293', 'train_tokens_per_second': '1975'} +{'loss': '1.022', 'grad_norm': '1.864', 'learning_rate': '1.571e-05', 'epoch': '0.03145', 'num_input_tokens_seen': 2556703, 'train_runtime': '1294', 'train_tokens_per_second': '1975'} +{'loss': '1.491', 'grad_norm': '2.115', 'learning_rate': '1.572e-05', 'epoch': '0.03147', 'num_input_tokens_seen': 2558750, 'train_runtime': '1295', 'train_tokens_per_second': '1975'} +{'loss': '1.046', 'grad_norm': '1.472', 'learning_rate': '1.574e-05', 'epoch': '0.0315', 'num_input_tokens_seen': 2560797, 'train_runtime': '1296', 'train_tokens_per_second': '1975'} +{'loss': '0.4949', 'grad_norm': '1.164', 'learning_rate': '1.575e-05', 'epoch': '0.03152', 'num_input_tokens_seen': 2562844, 'train_runtime': '1297', 'train_tokens_per_second': '1975'} +{'loss': '0.6764', 'grad_norm': '1.396', 'learning_rate': '1.576e-05', 'epoch': '0.03155', 'num_input_tokens_seen': 2564891, 'train_runtime': '1298', 'train_tokens_per_second': '1975'} +{'loss': '0.7591', 'grad_norm': '1.382', 'learning_rate': '1.577e-05', 'epoch': '0.03157', 'num_input_tokens_seen': 2566938, 'train_runtime': '1299', 'train_tokens_per_second': '1975'} +{'loss': '0.5699', 'grad_norm': '1.252', 'learning_rate': '1.579e-05', 'epoch': '0.0316', 'num_input_tokens_seen': 2568985, 'train_runtime': '1300', 'train_tokens_per_second': '1975'} +{'loss': '0.8257', 'grad_norm': '1.349', 'learning_rate': '1.58e-05', 'epoch': '0.03162', 'num_input_tokens_seen': 2571032, 'train_runtime': '1302', 'train_tokens_per_second': '1975'} +{'loss': '1.018', 'grad_norm': '1.367', 'learning_rate': '1.581e-05', 'epoch': '0.03165', 'num_input_tokens_seen': 2573079, 'train_runtime': '1303', 'train_tokens_per_second': '1975'} +{'loss': '0.7563', 'grad_norm': '1.336', 'learning_rate': '1.582e-05', 'epoch': '0.03167', 'num_input_tokens_seen': 2575126, 'train_runtime': '1304', 'train_tokens_per_second': '1975'} +{'loss': '1.033', 'grad_norm': '1.714', 'learning_rate': '1.584e-05', 'epoch': '0.0317', 'num_input_tokens_seen': 2577173, 'train_runtime': '1305', 'train_tokens_per_second': '1975'} +{'loss': '0.7068', 'grad_norm': '1.282', 'learning_rate': '1.585e-05', 'epoch': '0.03172', 'num_input_tokens_seen': 2579220, 'train_runtime': '1306', 'train_tokens_per_second': '1975'} +{'loss': '0.5598', 'grad_norm': '1.234', 'learning_rate': '1.586e-05', 'epoch': '0.03175', 'num_input_tokens_seen': 2581267, 'train_runtime': '1307', 'train_tokens_per_second': '1975'} +{'loss': '2.503', 'grad_norm': '2.775', 'learning_rate': '1.587e-05', 'epoch': '0.03177', 'num_input_tokens_seen': 2583314, 'train_runtime': '1308', 'train_tokens_per_second': '1975'} +{'loss': '1.551', 'grad_norm': '2.367', 'learning_rate': '1.589e-05', 'epoch': '0.0318', 'num_input_tokens_seen': 2585361, 'train_runtime': '1309', 'train_tokens_per_second': '1975'} +{'loss': '1.05', 'grad_norm': '1.617', 'learning_rate': '1.59e-05', 'epoch': '0.03183', 'num_input_tokens_seen': 2587408, 'train_runtime': '1310', 'train_tokens_per_second': '1975'} +{'loss': '1.397', 'grad_norm': '2.269', 'learning_rate': '1.591e-05', 'epoch': '0.03185', 'num_input_tokens_seen': 2589455, 'train_runtime': '1311', 'train_tokens_per_second': '1975'} +{'loss': '1.703', 'grad_norm': '2.252', 'learning_rate': '1.592e-05', 'epoch': '0.03188', 'num_input_tokens_seen': 2591502, 'train_runtime': '1312', 'train_tokens_per_second': '1975'} +{'loss': '0.6695', 'grad_norm': '1.277', 'learning_rate': '1.594e-05', 'epoch': '0.0319', 'num_input_tokens_seen': 2593549, 'train_runtime': '1313', 'train_tokens_per_second': '1975'} +{'loss': '1.773', 'grad_norm': '2.013', 'learning_rate': '1.595e-05', 'epoch': '0.03193', 'num_input_tokens_seen': 2595596, 'train_runtime': '1314', 'train_tokens_per_second': '1975'} +{'loss': '0.5617', 'grad_norm': '1.137', 'learning_rate': '1.596e-05', 'epoch': '0.03195', 'num_input_tokens_seen': 2597643, 'train_runtime': '1315', 'train_tokens_per_second': '1975'} +{'loss': '2.14', 'grad_norm': '2.423', 'learning_rate': '1.597e-05', 'epoch': '0.03198', 'num_input_tokens_seen': 2599690, 'train_runtime': '1316', 'train_tokens_per_second': '1975'} +{'loss': '1.326', 'grad_norm': '1.888', 'learning_rate': '1.599e-05', 'epoch': '0.032', 'num_input_tokens_seen': 2601737, 'train_runtime': '1317', 'train_tokens_per_second': '1975'} +{'loss': '0.6476', 'grad_norm': '1.234', 'learning_rate': '1.6e-05', 'epoch': '0.03203', 'num_input_tokens_seen': 2603784, 'train_runtime': '1318', 'train_tokens_per_second': '1975'} +{'loss': '0.6627', 'grad_norm': '1.489', 'learning_rate': '1.601e-05', 'epoch': '0.03205', 'num_input_tokens_seen': 2605831, 'train_runtime': '1319', 'train_tokens_per_second': '1975'} +{'loss': '0.4599', 'grad_norm': '1.038', 'learning_rate': '1.602e-05', 'epoch': '0.03208', 'num_input_tokens_seen': 2607878, 'train_runtime': '1320', 'train_tokens_per_second': '1975'} +{'loss': '1.407', 'grad_norm': '1.878', 'learning_rate': '1.604e-05', 'epoch': '0.0321', 'num_input_tokens_seen': 2609925, 'train_runtime': '1321', 'train_tokens_per_second': '1975'} +{'loss': '0.5688', 'grad_norm': '1.332', 'learning_rate': '1.605e-05', 'epoch': '0.03213', 'num_input_tokens_seen': 2611972, 'train_runtime': '1322', 'train_tokens_per_second': '1975'} +{'loss': '1.425', 'grad_norm': '1.965', 'learning_rate': '1.606e-05', 'epoch': '0.03215', 'num_input_tokens_seen': 2614019, 'train_runtime': '1323', 'train_tokens_per_second': '1975'} +{'loss': '0.4897', 'grad_norm': '1.196', 'learning_rate': '1.608e-05', 'epoch': '0.03218', 'num_input_tokens_seen': 2616066, 'train_runtime': '1324', 'train_tokens_per_second': '1975'} +{'loss': '1.567', 'grad_norm': '1.885', 'learning_rate': '1.609e-05', 'epoch': '0.0322', 'num_input_tokens_seen': 2618113, 'train_runtime': '1325', 'train_tokens_per_second': '1975'} +{'loss': '1.975', 'grad_norm': '1.99', 'learning_rate': '1.61e-05', 'epoch': '0.03223', 'num_input_tokens_seen': 2620160, 'train_runtime': '1326', 'train_tokens_per_second': '1975'} +{'loss': '0.5707', 'grad_norm': '1.252', 'learning_rate': '1.611e-05', 'epoch': '0.03225', 'num_input_tokens_seen': 2622207, 'train_runtime': '1327', 'train_tokens_per_second': '1975'} +{'loss': '1.003', 'grad_norm': '1.376', 'learning_rate': '1.613e-05', 'epoch': '0.03228', 'num_input_tokens_seen': 2624254, 'train_runtime': '1328', 'train_tokens_per_second': '1975'} +{'loss': '1.171', 'grad_norm': '1.536', 'learning_rate': '1.614e-05', 'epoch': '0.0323', 'num_input_tokens_seen': 2626301, 'train_runtime': '1329', 'train_tokens_per_second': '1975'} +{'loss': '2.066', 'grad_norm': '2.384', 'learning_rate': '1.615e-05', 'epoch': '0.03233', 'num_input_tokens_seen': 2628348, 'train_runtime': '1331', 'train_tokens_per_second': '1975'} +{'loss': '1.373', 'grad_norm': '1.868', 'learning_rate': '1.616e-05', 'epoch': '0.03235', 'num_input_tokens_seen': 2630395, 'train_runtime': '1332', 'train_tokens_per_second': '1975'} +{'loss': '2.245', 'grad_norm': '2.319', 'learning_rate': '1.618e-05', 'epoch': '0.03238', 'num_input_tokens_seen': 2632442, 'train_runtime': '1333', 'train_tokens_per_second': '1975'} +{'loss': '1.162', 'grad_norm': '1.597', 'learning_rate': '1.619e-05', 'epoch': '0.0324', 'num_input_tokens_seen': 2634489, 'train_runtime': '1334', 'train_tokens_per_second': '1975'} +{'loss': '1.159', 'grad_norm': '1.731', 'learning_rate': '1.62e-05', 'epoch': '0.03243', 'num_input_tokens_seen': 2636536, 'train_runtime': '1335', 'train_tokens_per_second': '1975'} +{'loss': '0.9201', 'grad_norm': '1.517', 'learning_rate': '1.621e-05', 'epoch': '0.03245', 'num_input_tokens_seen': 2638583, 'train_runtime': '1336', 'train_tokens_per_second': '1975'} +{'loss': '0.5574', 'grad_norm': '1.032', 'learning_rate': '1.623e-05', 'epoch': '0.03248', 'num_input_tokens_seen': 2640630, 'train_runtime': '1337', 'train_tokens_per_second': '1975'} +{'loss': '1.67', 'grad_norm': '1.993', 'learning_rate': '1.624e-05', 'epoch': '0.0325', 'num_input_tokens_seen': 2642677, 'train_runtime': '1338', 'train_tokens_per_second': '1975'} +{'loss': '2.964', 'grad_norm': '1.819', 'learning_rate': '1.625e-05', 'epoch': '0.03253', 'num_input_tokens_seen': 2644724, 'train_runtime': '1339', 'train_tokens_per_second': '1975'} +{'loss': '0.5454', 'grad_norm': '1.17', 'learning_rate': '1.626e-05', 'epoch': '0.03256', 'num_input_tokens_seen': 2646771, 'train_runtime': '1340', 'train_tokens_per_second': '1975'} +{'loss': '0.7742', 'grad_norm': '1.502', 'learning_rate': '1.628e-05', 'epoch': '0.03258', 'num_input_tokens_seen': 2648818, 'train_runtime': '1341', 'train_tokens_per_second': '1975'} +{'loss': '0.5333', 'grad_norm': '1.098', 'learning_rate': '1.629e-05', 'epoch': '0.03261', 'num_input_tokens_seen': 2650865, 'train_runtime': '1342', 'train_tokens_per_second': '1975'} +{'loss': '0.5114', 'grad_norm': '0.9865', 'learning_rate': '1.63e-05', 'epoch': '0.03263', 'num_input_tokens_seen': 2652912, 'train_runtime': '1343', 'train_tokens_per_second': '1975'} +{'loss': '0.5716', 'grad_norm': '1.153', 'learning_rate': '1.631e-05', 'epoch': '0.03266', 'num_input_tokens_seen': 2654959, 'train_runtime': '1344', 'train_tokens_per_second': '1975'} +{'loss': '1.954', 'grad_norm': '2.005', 'learning_rate': '1.633e-05', 'epoch': '0.03268', 'num_input_tokens_seen': 2657006, 'train_runtime': '1345', 'train_tokens_per_second': '1975'} +{'loss': '1.154', 'grad_norm': '1.732', 'learning_rate': '1.634e-05', 'epoch': '0.03271', 'num_input_tokens_seen': 2659053, 'train_runtime': '1346', 'train_tokens_per_second': '1975'} +{'loss': '1.279', 'grad_norm': '1.981', 'learning_rate': '1.635e-05', 'epoch': '0.03273', 'num_input_tokens_seen': 2661100, 'train_runtime': '1347', 'train_tokens_per_second': '1975'} +{'loss': '2.152', 'grad_norm': '2.176', 'learning_rate': '1.636e-05', 'epoch': '0.03276', 'num_input_tokens_seen': 2663147, 'train_runtime': '1348', 'train_tokens_per_second': '1975'} +{'loss': '0.6705', 'grad_norm': '1.328', 'learning_rate': '1.638e-05', 'epoch': '0.03278', 'num_input_tokens_seen': 2665194, 'train_runtime': '1349', 'train_tokens_per_second': '1975'} +{'loss': '1.93', 'grad_norm': '2.162', 'learning_rate': '1.639e-05', 'epoch': '0.03281', 'num_input_tokens_seen': 2667241, 'train_runtime': '1351', 'train_tokens_per_second': '1975'} +{'loss': '0.6314', 'grad_norm': '1.218', 'learning_rate': '1.64e-05', 'epoch': '0.03283', 'num_input_tokens_seen': 2669288, 'train_runtime': '1352', 'train_tokens_per_second': '1975'} +{'loss': '0.47', 'grad_norm': '1.077', 'learning_rate': '1.641e-05', 'epoch': '0.03286', 'num_input_tokens_seen': 2671335, 'train_runtime': '1353', 'train_tokens_per_second': '1975'} +{'loss': '0.5522', 'grad_norm': '1.209', 'learning_rate': '1.643e-05', 'epoch': '0.03288', 'num_input_tokens_seen': 2673382, 'train_runtime': '1354', 'train_tokens_per_second': '1975'} +{'loss': '1.405', 'grad_norm': '1.885', 'learning_rate': '1.644e-05', 'epoch': '0.03291', 'num_input_tokens_seen': 2675429, 'train_runtime': '1355', 'train_tokens_per_second': '1975'} +{'loss': '1.109', 'grad_norm': '1.692', 'learning_rate': '1.645e-05', 'epoch': '0.03293', 'num_input_tokens_seen': 2677476, 'train_runtime': '1356', 'train_tokens_per_second': '1975'} +{'loss': '2.764', 'grad_norm': '2.317', 'learning_rate': '1.647e-05', 'epoch': '0.03296', 'num_input_tokens_seen': 2679523, 'train_runtime': '1357', 'train_tokens_per_second': '1975'} +{'loss': '1.065', 'grad_norm': '1.584', 'learning_rate': '1.648e-05', 'epoch': '0.03298', 'num_input_tokens_seen': 2681570, 'train_runtime': '1358', 'train_tokens_per_second': '1975'} +{'loss': '1.141', 'grad_norm': '1.704', 'learning_rate': '1.649e-05', 'epoch': '0.03301', 'num_input_tokens_seen': 2683617, 'train_runtime': '1359', 'train_tokens_per_second': '1975'} +{'loss': '0.9277', 'grad_norm': '1.587', 'learning_rate': '1.65e-05', 'epoch': '0.03303', 'num_input_tokens_seen': 2685664, 'train_runtime': '1360', 'train_tokens_per_second': '1975'} +{'loss': '0.8496', 'grad_norm': '1.484', 'learning_rate': '1.652e-05', 'epoch': '0.03306', 'num_input_tokens_seen': 2687711, 'train_runtime': '1361', 'train_tokens_per_second': '1975'} +{'loss': '0.6649', 'grad_norm': '1.393', 'learning_rate': '1.653e-05', 'epoch': '0.03308', 'num_input_tokens_seen': 2689758, 'train_runtime': '1362', 'train_tokens_per_second': '1975'} +{'loss': '0.6048', 'grad_norm': '1.52', 'learning_rate': '1.654e-05', 'epoch': '0.03311', 'num_input_tokens_seen': 2691805, 'train_runtime': '1363', 'train_tokens_per_second': '1975'} +{'loss': '0.9402', 'grad_norm': '1.467', 'learning_rate': '1.655e-05', 'epoch': '0.03313', 'num_input_tokens_seen': 2693852, 'train_runtime': '1364', 'train_tokens_per_second': '1975'} +{'loss': '1.249', 'grad_norm': '1.939', 'learning_rate': '1.657e-05', 'epoch': '0.03316', 'num_input_tokens_seen': 2695899, 'train_runtime': '1365', 'train_tokens_per_second': '1975'} +{'loss': '0.5079', 'grad_norm': '1.121', 'learning_rate': '1.658e-05', 'epoch': '0.03318', 'num_input_tokens_seen': 2697946, 'train_runtime': '1366', 'train_tokens_per_second': '1975'} +{'loss': '1.365', 'grad_norm': '2.14', 'learning_rate': '1.659e-05', 'epoch': '0.03321', 'num_input_tokens_seen': 2699993, 'train_runtime': '1367', 'train_tokens_per_second': '1975'} +{'loss': '1.716', 'grad_norm': '2.001', 'learning_rate': '1.66e-05', 'epoch': '0.03324', 'num_input_tokens_seen': 2702040, 'train_runtime': '1368', 'train_tokens_per_second': '1975'} +{'loss': '0.554', 'grad_norm': '1.264', 'learning_rate': '1.662e-05', 'epoch': '0.03326', 'num_input_tokens_seen': 2704087, 'train_runtime': '1369', 'train_tokens_per_second': '1975'} +{'loss': '0.7407', 'grad_norm': '1.18', 'learning_rate': '1.663e-05', 'epoch': '0.03329', 'num_input_tokens_seen': 2706134, 'train_runtime': '1370', 'train_tokens_per_second': '1975'} +{'loss': '0.8566', 'grad_norm': '1.629', 'learning_rate': '1.664e-05', 'epoch': '0.03331', 'num_input_tokens_seen': 2708181, 'train_runtime': '1371', 'train_tokens_per_second': '1975'} +{'loss': '1.04', 'grad_norm': '1.89', 'learning_rate': '1.665e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 2710228, 'train_runtime': '1372', 'train_tokens_per_second': '1975'} +{'loss': '0.6722', 'grad_norm': '1.585', 'learning_rate': '1.667e-05', 'epoch': '0.03336', 'num_input_tokens_seen': 2712275, 'train_runtime': '1373', 'train_tokens_per_second': '1975'} +{'loss': '0.4606', 'grad_norm': '1.15', 'learning_rate': '1.668e-05', 'epoch': '0.03339', 'num_input_tokens_seen': 2714322, 'train_runtime': '1374', 'train_tokens_per_second': '1975'} +{'loss': '1.361', 'grad_norm': '2.021', 'learning_rate': '1.669e-05', 'epoch': '0.03341', 'num_input_tokens_seen': 2716369, 'train_runtime': '1375', 'train_tokens_per_second': '1975'} +{'loss': '0.8359', 'grad_norm': '1.595', 'learning_rate': '1.67e-05', 'epoch': '0.03344', 'num_input_tokens_seen': 2718416, 'train_runtime': '1376', 'train_tokens_per_second': '1975'} +{'loss': '0.5663', 'grad_norm': '1.414', 'learning_rate': '1.672e-05', 'epoch': '0.03346', 'num_input_tokens_seen': 2720463, 'train_runtime': '1377', 'train_tokens_per_second': '1975'} +{'loss': '0.4416', 'grad_norm': '1.244', 'learning_rate': '1.673e-05', 'epoch': '0.03349', 'num_input_tokens_seen': 2722510, 'train_runtime': '1378', 'train_tokens_per_second': '1975'} +{'loss': '1.062', 'grad_norm': '1.914', 'learning_rate': '1.674e-05', 'epoch': '0.03351', 'num_input_tokens_seen': 2724557, 'train_runtime': '1379', 'train_tokens_per_second': '1975'} +{'loss': '1.78', 'grad_norm': '2.369', 'learning_rate': '1.675e-05', 'epoch': '0.03354', 'num_input_tokens_seen': 2726604, 'train_runtime': '1381', 'train_tokens_per_second': '1975'} +{'loss': '0.5946', 'grad_norm': '1.287', 'learning_rate': '1.677e-05', 'epoch': '0.03356', 'num_input_tokens_seen': 2728651, 'train_runtime': '1382', 'train_tokens_per_second': '1975'} +{'loss': '1.247', 'grad_norm': '1.807', 'learning_rate': '1.678e-05', 'epoch': '0.03359', 'num_input_tokens_seen': 2730698, 'train_runtime': '1383', 'train_tokens_per_second': '1975'} +{'loss': '3.607', 'grad_norm': '2.857', 'learning_rate': '1.679e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 2732745, 'train_runtime': '1384', 'train_tokens_per_second': '1975'} +{'loss': '1.383', 'grad_norm': '1.879', 'learning_rate': '1.681e-05', 'epoch': '0.03364', 'num_input_tokens_seen': 2734792, 'train_runtime': '1385', 'train_tokens_per_second': '1975'} +{'loss': '0.5078', 'grad_norm': '1.229', 'learning_rate': '1.682e-05', 'epoch': '0.03366', 'num_input_tokens_seen': 2736839, 'train_runtime': '1386', 'train_tokens_per_second': '1975'} +{'loss': '1.102', 'grad_norm': '1.71', 'learning_rate': '1.683e-05', 'epoch': '0.03369', 'num_input_tokens_seen': 2738886, 'train_runtime': '1387', 'train_tokens_per_second': '1975'} +{'loss': '1.423', 'grad_norm': '2.113', 'learning_rate': '1.684e-05', 'epoch': '0.03371', 'num_input_tokens_seen': 2740933, 'train_runtime': '1388', 'train_tokens_per_second': '1975'} +{'loss': '1.617', 'grad_norm': '2.04', 'learning_rate': '1.686e-05', 'epoch': '0.03374', 'num_input_tokens_seen': 2742980, 'train_runtime': '1389', 'train_tokens_per_second': '1975'} +{'loss': '0.7746', 'grad_norm': '1.658', 'learning_rate': '1.687e-05', 'epoch': '0.03376', 'num_input_tokens_seen': 2745027, 'train_runtime': '1390', 'train_tokens_per_second': '1975'} +{'loss': '0.9626', 'grad_norm': '1.551', 'learning_rate': '1.688e-05', 'epoch': '0.03379', 'num_input_tokens_seen': 2747074, 'train_runtime': '1391', 'train_tokens_per_second': '1975'} +{'loss': '1.95', 'grad_norm': '2.289', 'learning_rate': '1.689e-05', 'epoch': '0.03381', 'num_input_tokens_seen': 2749121, 'train_runtime': '1392', 'train_tokens_per_second': '1975'} +{'loss': '1.23', 'grad_norm': '1.65', 'learning_rate': '1.691e-05', 'epoch': '0.03384', 'num_input_tokens_seen': 2751168, 'train_runtime': '1393', 'train_tokens_per_second': '1975'} +{'loss': '1.025', 'grad_norm': '1.476', 'learning_rate': '1.692e-05', 'epoch': '0.03386', 'num_input_tokens_seen': 2753215, 'train_runtime': '1394', 'train_tokens_per_second': '1975'} +{'loss': '1.278', 'grad_norm': '1.998', 'learning_rate': '1.693e-05', 'epoch': '0.03389', 'num_input_tokens_seen': 2755262, 'train_runtime': '1395', 'train_tokens_per_second': '1975'} +{'loss': '1.281', 'grad_norm': '2.01', 'learning_rate': '1.694e-05', 'epoch': '0.03391', 'num_input_tokens_seen': 2757309, 'train_runtime': '1396', 'train_tokens_per_second': '1975'} +{'loss': '1.201', 'grad_norm': '1.824', 'learning_rate': '1.696e-05', 'epoch': '0.03394', 'num_input_tokens_seen': 2759356, 'train_runtime': '1397', 'train_tokens_per_second': '1975'} +{'loss': '0.8889', 'grad_norm': '1.377', 'learning_rate': '1.697e-05', 'epoch': '0.03397', 'num_input_tokens_seen': 2761403, 'train_runtime': '1398', 'train_tokens_per_second': '1975'} +{'loss': '1.491', 'grad_norm': '2.233', 'learning_rate': '1.698e-05', 'epoch': '0.03399', 'num_input_tokens_seen': 2763450, 'train_runtime': '1399', 'train_tokens_per_second': '1975'} +{'loss': '0.8327', 'grad_norm': '1.624', 'learning_rate': '1.699e-05', 'epoch': '0.03402', 'num_input_tokens_seen': 2765497, 'train_runtime': '1400', 'train_tokens_per_second': '1975'} +{'loss': '0.6443', 'grad_norm': '1.422', 'learning_rate': '1.701e-05', 'epoch': '0.03404', 'num_input_tokens_seen': 2767544, 'train_runtime': '1401', 'train_tokens_per_second': '1975'} +{'loss': '0.4775', 'grad_norm': '1.214', 'learning_rate': '1.702e-05', 'epoch': '0.03407', 'num_input_tokens_seen': 2769591, 'train_runtime': '1402', 'train_tokens_per_second': '1975'} +{'loss': '1.216', 'grad_norm': '1.687', 'learning_rate': '1.703e-05', 'epoch': '0.03409', 'num_input_tokens_seen': 2771638, 'train_runtime': '1403', 'train_tokens_per_second': '1975'} +{'loss': '0.4078', 'grad_norm': '1.036', 'learning_rate': '1.704e-05', 'epoch': '0.03412', 'num_input_tokens_seen': 2773685, 'train_runtime': '1404', 'train_tokens_per_second': '1975'} +{'loss': '0.9739', 'grad_norm': '1.678', 'learning_rate': '1.706e-05', 'epoch': '0.03414', 'num_input_tokens_seen': 2775732, 'train_runtime': '1405', 'train_tokens_per_second': '1975'} +{'loss': '0.8372', 'grad_norm': '1.594', 'learning_rate': '1.707e-05', 'epoch': '0.03417', 'num_input_tokens_seen': 2777779, 'train_runtime': '1406', 'train_tokens_per_second': '1975'} +{'loss': '1.824', 'grad_norm': '2.265', 'learning_rate': '1.708e-05', 'epoch': '0.03419', 'num_input_tokens_seen': 2779826, 'train_runtime': '1407', 'train_tokens_per_second': '1975'} +{'loss': '1.494', 'grad_norm': '2.211', 'learning_rate': '1.709e-05', 'epoch': '0.03422', 'num_input_tokens_seen': 2781873, 'train_runtime': '1408', 'train_tokens_per_second': '1975'} +{'loss': '0.5735', 'grad_norm': '1.298', 'learning_rate': '1.711e-05', 'epoch': '0.03424', 'num_input_tokens_seen': 2783920, 'train_runtime': '1409', 'train_tokens_per_second': '1975'} +{'loss': '0.3931', 'grad_norm': '1.111', 'learning_rate': '1.712e-05', 'epoch': '0.03427', 'num_input_tokens_seen': 2785967, 'train_runtime': '1410', 'train_tokens_per_second': '1975'} +{'loss': '1.059', 'grad_norm': '1.649', 'learning_rate': '1.713e-05', 'epoch': '0.03429', 'num_input_tokens_seen': 2788014, 'train_runtime': '1411', 'train_tokens_per_second': '1975'} +{'loss': '0.943', 'grad_norm': '1.937', 'learning_rate': '1.715e-05', 'epoch': '0.03432', 'num_input_tokens_seen': 2790061, 'train_runtime': '1413', 'train_tokens_per_second': '1975'} +{'loss': '1.726', 'grad_norm': '2.379', 'learning_rate': '1.716e-05', 'epoch': '0.03434', 'num_input_tokens_seen': 2792108, 'train_runtime': '1414', 'train_tokens_per_second': '1975'} +{'loss': '1.205', 'grad_norm': '1.904', 'learning_rate': '1.717e-05', 'epoch': '0.03437', 'num_input_tokens_seen': 2794155, 'train_runtime': '1415', 'train_tokens_per_second': '1975'} +{'loss': '0.8399', 'grad_norm': '1.725', 'learning_rate': '1.718e-05', 'epoch': '0.03439', 'num_input_tokens_seen': 2796202, 'train_runtime': '1416', 'train_tokens_per_second': '1975'} +{'loss': '1.324', 'grad_norm': '1.856', 'learning_rate': '1.72e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 2798249, 'train_runtime': '1417', 'train_tokens_per_second': '1975'} +{'loss': '2.775', 'grad_norm': '2.855', 'learning_rate': '1.721e-05', 'epoch': '0.03444', 'num_input_tokens_seen': 2800296, 'train_runtime': '1418', 'train_tokens_per_second': '1975'} +{'loss': '0.792', 'grad_norm': '1.628', 'learning_rate': '1.722e-05', 'epoch': '0.03447', 'num_input_tokens_seen': 2802343, 'train_runtime': '1419', 'train_tokens_per_second': '1975'} +{'loss': '1.445', 'grad_norm': '1.828', 'learning_rate': '1.723e-05', 'epoch': '0.03449', 'num_input_tokens_seen': 2804390, 'train_runtime': '1420', 'train_tokens_per_second': '1975'} +{'loss': '0.5453', 'grad_norm': '1.685', 'learning_rate': '1.725e-05', 'epoch': '0.03452', 'num_input_tokens_seen': 2806437, 'train_runtime': '1421', 'train_tokens_per_second': '1975'} +{'loss': '1.925', 'grad_norm': '2.227', 'learning_rate': '1.726e-05', 'epoch': '0.03454', 'num_input_tokens_seen': 2808484, 'train_runtime': '1422', 'train_tokens_per_second': '1975'} +{'loss': '0.6132', 'grad_norm': '1.374', 'learning_rate': '1.727e-05', 'epoch': '0.03457', 'num_input_tokens_seen': 2810531, 'train_runtime': '1423', 'train_tokens_per_second': '1975'} +{'loss': '2.056', 'grad_norm': '1.972', 'learning_rate': '1.728e-05', 'epoch': '0.03459', 'num_input_tokens_seen': 2812578, 'train_runtime': '1424', 'train_tokens_per_second': '1975'} +{'loss': '1.246', 'grad_norm': '1.824', 'learning_rate': '1.73e-05', 'epoch': '0.03462', 'num_input_tokens_seen': 2814625, 'train_runtime': '1425', 'train_tokens_per_second': '1975'} +{'loss': '0.9486', 'grad_norm': '1.807', 'learning_rate': '1.731e-05', 'epoch': '0.03465', 'num_input_tokens_seen': 2816672, 'train_runtime': '1426', 'train_tokens_per_second': '1975'} +{'loss': '1.608', 'grad_norm': '1.953', 'learning_rate': '1.732e-05', 'epoch': '0.03467', 'num_input_tokens_seen': 2818719, 'train_runtime': '1427', 'train_tokens_per_second': '1975'} +{'loss': '2.003', 'grad_norm': '2.332', 'learning_rate': '1.733e-05', 'epoch': '0.0347', 'num_input_tokens_seen': 2820766, 'train_runtime': '1428', 'train_tokens_per_second': '1975'} +{'loss': '1.129', 'grad_norm': '2.001', 'learning_rate': '1.735e-05', 'epoch': '0.03472', 'num_input_tokens_seen': 2822813, 'train_runtime': '1429', 'train_tokens_per_second': '1975'} +{'loss': '1.289', 'grad_norm': '2.038', 'learning_rate': '1.736e-05', 'epoch': '0.03475', 'num_input_tokens_seen': 2824860, 'train_runtime': '1430', 'train_tokens_per_second': '1975'} +{'loss': '1.146', 'grad_norm': '1.657', 'learning_rate': '1.737e-05', 'epoch': '0.03477', 'num_input_tokens_seen': 2826907, 'train_runtime': '1431', 'train_tokens_per_second': '1975'} +{'loss': '0.5207', 'grad_norm': '1.382', 'learning_rate': '1.738e-05', 'epoch': '0.0348', 'num_input_tokens_seen': 2828954, 'train_runtime': '1432', 'train_tokens_per_second': '1975'} +{'loss': '1.992', 'grad_norm': '2.748', 'learning_rate': '1.74e-05', 'epoch': '0.03482', 'num_input_tokens_seen': 2831001, 'train_runtime': '1433', 'train_tokens_per_second': '1975'} +{'loss': '1.514', 'grad_norm': '1.876', 'learning_rate': '1.741e-05', 'epoch': '0.03485', 'num_input_tokens_seen': 2833048, 'train_runtime': '1434', 'train_tokens_per_second': '1975'} +{'loss': '1.058', 'grad_norm': '1.906', 'learning_rate': '1.742e-05', 'epoch': '0.03487', 'num_input_tokens_seen': 2835095, 'train_runtime': '1435', 'train_tokens_per_second': '1975'} +{'loss': '0.475', 'grad_norm': '1.293', 'learning_rate': '1.743e-05', 'epoch': '0.0349', 'num_input_tokens_seen': 2837142, 'train_runtime': '1436', 'train_tokens_per_second': '1975'} +{'loss': '1.161', 'grad_norm': '2.05', 'learning_rate': '1.745e-05', 'epoch': '0.03492', 'num_input_tokens_seen': 2839189, 'train_runtime': '1437', 'train_tokens_per_second': '1975'} +{'loss': '2.138', 'grad_norm': '2.653', 'learning_rate': '1.746e-05', 'epoch': '0.03495', 'num_input_tokens_seen': 2841236, 'train_runtime': '1438', 'train_tokens_per_second': '1975'} +{'loss': '1.991', 'grad_norm': '2.867', 'learning_rate': '1.747e-05', 'epoch': '0.03497', 'num_input_tokens_seen': 2843283, 'train_runtime': '1439', 'train_tokens_per_second': '1975'} +{'loss': '0.6344', 'grad_norm': '1.673', 'learning_rate': '1.748e-05', 'epoch': '0.035', 'num_input_tokens_seen': 2845330, 'train_runtime': '1440', 'train_tokens_per_second': '1975'} +{'loss': '0.5256', 'grad_norm': '1.139', 'learning_rate': '1.75e-05', 'epoch': '0.03502', 'num_input_tokens_seen': 2847377, 'train_runtime': '1441', 'train_tokens_per_second': '1975'} +{'loss': '1.142', 'grad_norm': '2.293', 'learning_rate': '1.751e-05', 'epoch': '0.03505', 'num_input_tokens_seen': 2849424, 'train_runtime': '1442', 'train_tokens_per_second': '1975'} +{'loss': '0.52', 'grad_norm': '1.444', 'learning_rate': '1.752e-05', 'epoch': '0.03507', 'num_input_tokens_seen': 2851471, 'train_runtime': '1443', 'train_tokens_per_second': '1975'} +{'loss': '1.766', 'grad_norm': '2.307', 'learning_rate': '1.754e-05', 'epoch': '0.0351', 'num_input_tokens_seen': 2853518, 'train_runtime': '1445', 'train_tokens_per_second': '1975'} +{'loss': '0.4696', 'grad_norm': '1.349', 'learning_rate': '1.755e-05', 'epoch': '0.03512', 'num_input_tokens_seen': 2855565, 'train_runtime': '1446', 'train_tokens_per_second': '1975'} +{'loss': '1.881', 'grad_norm': '2.303', 'learning_rate': '1.756e-05', 'epoch': '0.03515', 'num_input_tokens_seen': 2857612, 'train_runtime': '1447', 'train_tokens_per_second': '1975'} +{'loss': '0.4405', 'grad_norm': '1.401', 'learning_rate': '1.757e-05', 'epoch': '0.03517', 'num_input_tokens_seen': 2859659, 'train_runtime': '1448', 'train_tokens_per_second': '1975'} +{'loss': '0.5864', 'grad_norm': '1.41', 'learning_rate': '1.759e-05', 'epoch': '0.0352', 'num_input_tokens_seen': 2861706, 'train_runtime': '1449', 'train_tokens_per_second': '1975'} +{'loss': '0.8654', 'grad_norm': '1.612', 'learning_rate': '1.76e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 2863753, 'train_runtime': '1450', 'train_tokens_per_second': '1975'} +{'loss': '1.446', 'grad_norm': '2.058', 'learning_rate': '1.761e-05', 'epoch': '0.03525', 'num_input_tokens_seen': 2865800, 'train_runtime': '1451', 'train_tokens_per_second': '1975'} +{'loss': '1.07', 'grad_norm': '1.933', 'learning_rate': '1.762e-05', 'epoch': '0.03527', 'num_input_tokens_seen': 2867847, 'train_runtime': '1452', 'train_tokens_per_second': '1975'} +{'loss': '1.851', 'grad_norm': '2.319', 'learning_rate': '1.764e-05', 'epoch': '0.0353', 'num_input_tokens_seen': 2869894, 'train_runtime': '1453', 'train_tokens_per_second': '1975'} +{'loss': '1.279', 'grad_norm': '2.397', 'learning_rate': '1.765e-05', 'epoch': '0.03532', 'num_input_tokens_seen': 2871941, 'train_runtime': '1454', 'train_tokens_per_second': '1975'} +{'loss': '1.053', 'grad_norm': '1.652', 'learning_rate': '1.766e-05', 'epoch': '0.03535', 'num_input_tokens_seen': 2873988, 'train_runtime': '1455', 'train_tokens_per_second': '1975'} +{'loss': '0.438', 'grad_norm': '1.169', 'learning_rate': '1.767e-05', 'epoch': '0.03538', 'num_input_tokens_seen': 2876035, 'train_runtime': '1456', 'train_tokens_per_second': '1975'} +{'loss': '0.5597', 'grad_norm': '1.336', 'learning_rate': '1.769e-05', 'epoch': '0.0354', 'num_input_tokens_seen': 2878082, 'train_runtime': '1457', 'train_tokens_per_second': '1975'} +{'loss': '0.4852', 'grad_norm': '1.269', 'learning_rate': '1.77e-05', 'epoch': '0.03543', 'num_input_tokens_seen': 2880129, 'train_runtime': '1458', 'train_tokens_per_second': '1976'} +{'loss': '1.831', 'grad_norm': '2.7', 'learning_rate': '1.771e-05', 'epoch': '0.03545', 'num_input_tokens_seen': 2882176, 'train_runtime': '1459', 'train_tokens_per_second': '1976'} +{'loss': '2.82', 'grad_norm': '2.815', 'learning_rate': '1.772e-05', 'epoch': '0.03548', 'num_input_tokens_seen': 2884223, 'train_runtime': '1460', 'train_tokens_per_second': '1976'} +{'loss': '0.6105', 'grad_norm': '1.639', 'learning_rate': '1.774e-05', 'epoch': '0.0355', 'num_input_tokens_seen': 2886270, 'train_runtime': '1461', 'train_tokens_per_second': '1976'} +{'loss': '1.338', 'grad_norm': '2.222', 'learning_rate': '1.775e-05', 'epoch': '0.03553', 'num_input_tokens_seen': 2888317, 'train_runtime': '1462', 'train_tokens_per_second': '1976'} +{'loss': '0.7932', 'grad_norm': '1.656', 'learning_rate': '1.776e-05', 'epoch': '0.03555', 'num_input_tokens_seen': 2890364, 'train_runtime': '1463', 'train_tokens_per_second': '1976'} +{'loss': '1.102', 'grad_norm': '1.602', 'learning_rate': '1.777e-05', 'epoch': '0.03558', 'num_input_tokens_seen': 2892411, 'train_runtime': '1464', 'train_tokens_per_second': '1976'} +{'loss': '1.094', 'grad_norm': '1.511', 'learning_rate': '1.779e-05', 'epoch': '0.0356', 'num_input_tokens_seen': 2894458, 'train_runtime': '1465', 'train_tokens_per_second': '1976'} +{'loss': '0.764', 'grad_norm': '1.994', 'learning_rate': '1.78e-05', 'epoch': '0.03563', 'num_input_tokens_seen': 2896505, 'train_runtime': '1466', 'train_tokens_per_second': '1976'} +{'loss': '0.4766', 'grad_norm': '1.234', 'learning_rate': '1.781e-05', 'epoch': '0.03565', 'num_input_tokens_seen': 2898552, 'train_runtime': '1467', 'train_tokens_per_second': '1976'} +{'loss': '0.9696', 'grad_norm': '1.665', 'learning_rate': '1.782e-05', 'epoch': '0.03568', 'num_input_tokens_seen': 2900599, 'train_runtime': '1468', 'train_tokens_per_second': '1976'} +{'loss': '1.632', 'grad_norm': '2.076', 'learning_rate': '1.784e-05', 'epoch': '0.0357', 'num_input_tokens_seen': 2902646, 'train_runtime': '1469', 'train_tokens_per_second': '1976'} +{'loss': '2.699', 'grad_norm': '2.94', 'learning_rate': '1.785e-05', 'epoch': '0.03573', 'num_input_tokens_seen': 2904693, 'train_runtime': '1470', 'train_tokens_per_second': '1976'} +{'loss': '1.486', 'grad_norm': '1.929', 'learning_rate': '1.786e-05', 'epoch': '0.03575', 'num_input_tokens_seen': 2906740, 'train_runtime': '1471', 'train_tokens_per_second': '1976'} +{'loss': '0.5287', 'grad_norm': '1.11', 'learning_rate': '1.788e-05', 'epoch': '0.03578', 'num_input_tokens_seen': 2908787, 'train_runtime': '1472', 'train_tokens_per_second': '1976'} +{'loss': '0.741', 'grad_norm': '1.573', 'learning_rate': '1.789e-05', 'epoch': '0.0358', 'num_input_tokens_seen': 2910834, 'train_runtime': '1473', 'train_tokens_per_second': '1976'} +{'loss': '1.497', 'grad_norm': '2.394', 'learning_rate': '1.79e-05', 'epoch': '0.03583', 'num_input_tokens_seen': 2912881, 'train_runtime': '1474', 'train_tokens_per_second': '1976'} +{'loss': '0.8666', 'grad_norm': '1.634', 'learning_rate': '1.791e-05', 'epoch': '0.03585', 'num_input_tokens_seen': 2914928, 'train_runtime': '1475', 'train_tokens_per_second': '1976'} +{'loss': '0.5435', 'grad_norm': '1.587', 'learning_rate': '1.793e-05', 'epoch': '0.03588', 'num_input_tokens_seen': 2916975, 'train_runtime': '1476', 'train_tokens_per_second': '1976'} +{'loss': '1.867', 'grad_norm': '2.659', 'learning_rate': '1.794e-05', 'epoch': '0.0359', 'num_input_tokens_seen': 2919022, 'train_runtime': '1478', 'train_tokens_per_second': '1976'} +{'loss': '0.5289', 'grad_norm': '1.137', 'learning_rate': '1.795e-05', 'epoch': '0.03593', 'num_input_tokens_seen': 2921069, 'train_runtime': '1479', 'train_tokens_per_second': '1976'} +{'loss': '0.4824', 'grad_norm': '1.395', 'learning_rate': '1.796e-05', 'epoch': '0.03595', 'num_input_tokens_seen': 2923116, 'train_runtime': '1480', 'train_tokens_per_second': '1976'} +{'loss': '0.5261', 'grad_norm': '1.362', 'learning_rate': '1.798e-05', 'epoch': '0.03598', 'num_input_tokens_seen': 2925163, 'train_runtime': '1481', 'train_tokens_per_second': '1976'} +{'loss': '1.89', 'grad_norm': '2.46', 'learning_rate': '1.799e-05', 'epoch': '0.036', 'num_input_tokens_seen': 2927210, 'train_runtime': '1482', 'train_tokens_per_second': '1976'} +{'loss': '0.6179', 'grad_norm': '1.474', 'learning_rate': '1.8e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 2929257, 'train_runtime': '1483', 'train_tokens_per_second': '1976'} +{'loss': '1.431', 'grad_norm': '1.988', 'learning_rate': '1.801e-05', 'epoch': '0.03606', 'num_input_tokens_seen': 2931304, 'train_runtime': '1484', 'train_tokens_per_second': '1976'} +{'loss': '0.8454', 'grad_norm': '1.457', 'learning_rate': '1.803e-05', 'epoch': '0.03608', 'num_input_tokens_seen': 2933351, 'train_runtime': '1485', 'train_tokens_per_second': '1976'} +{'loss': '0.869', 'grad_norm': '1.683', 'learning_rate': '1.804e-05', 'epoch': '0.03611', 'num_input_tokens_seen': 2935398, 'train_runtime': '1486', 'train_tokens_per_second': '1976'} +{'loss': '0.8285', 'grad_norm': '1.435', 'learning_rate': '1.805e-05', 'epoch': '0.03613', 'num_input_tokens_seen': 2937445, 'train_runtime': '1487', 'train_tokens_per_second': '1976'} +{'loss': '1.771', 'grad_norm': '2.208', 'learning_rate': '1.806e-05', 'epoch': '0.03616', 'num_input_tokens_seen': 2939492, 'train_runtime': '1488', 'train_tokens_per_second': '1976'} +{'loss': '0.3534', 'grad_norm': '1.199', 'learning_rate': '1.808e-05', 'epoch': '0.03618', 'num_input_tokens_seen': 2941539, 'train_runtime': '1489', 'train_tokens_per_second': '1976'} +{'loss': '1.44', 'grad_norm': '3.639', 'learning_rate': '1.809e-05', 'epoch': '0.03621', 'num_input_tokens_seen': 2943586, 'train_runtime': '1490', 'train_tokens_per_second': '1976'} +{'loss': '0.9386', 'grad_norm': '1.812', 'learning_rate': '1.81e-05', 'epoch': '0.03623', 'num_input_tokens_seen': 2945633, 'train_runtime': '1491', 'train_tokens_per_second': '1976'} +{'loss': '1.386', 'grad_norm': '2.234', 'learning_rate': '1.811e-05', 'epoch': '0.03626', 'num_input_tokens_seen': 2947680, 'train_runtime': '1492', 'train_tokens_per_second': '1976'} +{'loss': '0.515', 'grad_norm': '1.116', 'learning_rate': '1.813e-05', 'epoch': '0.03628', 'num_input_tokens_seen': 2949727, 'train_runtime': '1493', 'train_tokens_per_second': '1976'} +{'loss': '1.953', 'grad_norm': '2.39', 'learning_rate': '1.814e-05', 'epoch': '0.03631', 'num_input_tokens_seen': 2951774, 'train_runtime': '1494', 'train_tokens_per_second': '1976'} +{'loss': '0.6092', 'grad_norm': '1.473', 'learning_rate': '1.815e-05', 'epoch': '0.03633', 'num_input_tokens_seen': 2953821, 'train_runtime': '1495', 'train_tokens_per_second': '1976'} +{'loss': '1.187', 'grad_norm': '1.89', 'learning_rate': '1.816e-05', 'epoch': '0.03636', 'num_input_tokens_seen': 2955868, 'train_runtime': '1496', 'train_tokens_per_second': '1976'} +{'loss': '1.088', 'grad_norm': '1.7', 'learning_rate': '1.818e-05', 'epoch': '0.03638', 'num_input_tokens_seen': 2957915, 'train_runtime': '1497', 'train_tokens_per_second': '1976'} +{'loss': '0.6202', 'grad_norm': '1.478', 'learning_rate': '1.819e-05', 'epoch': '0.03641', 'num_input_tokens_seen': 2959962, 'train_runtime': '1498', 'train_tokens_per_second': '1976'} +{'loss': '0.6083', 'grad_norm': '1.234', 'learning_rate': '1.82e-05', 'epoch': '0.03643', 'num_input_tokens_seen': 2962009, 'train_runtime': '1499', 'train_tokens_per_second': '1976'} +{'loss': '0.433', 'grad_norm': '1.263', 'learning_rate': '1.822e-05', 'epoch': '0.03646', 'num_input_tokens_seen': 2964056, 'train_runtime': '1500', 'train_tokens_per_second': '1976'} +{'loss': '0.5335', 'grad_norm': '1.431', 'learning_rate': '1.823e-05', 'epoch': '0.03648', 'num_input_tokens_seen': 2966103, 'train_runtime': '1501', 'train_tokens_per_second': '1976'} +{'loss': '0.5075', 'grad_norm': '1.323', 'learning_rate': '1.824e-05', 'epoch': '0.03651', 'num_input_tokens_seen': 2968150, 'train_runtime': '1502', 'train_tokens_per_second': '1976'} +{'loss': '1.036', 'grad_norm': '1.841', 'learning_rate': '1.825e-05', 'epoch': '0.03653', 'num_input_tokens_seen': 2970197, 'train_runtime': '1503', 'train_tokens_per_second': '1976'} +{'loss': '1.095', 'grad_norm': '3.215', 'learning_rate': '1.827e-05', 'epoch': '0.03656', 'num_input_tokens_seen': 2972244, 'train_runtime': '1504', 'train_tokens_per_second': '1976'} +{'loss': '1.82', 'grad_norm': '2.239', 'learning_rate': '1.828e-05', 'epoch': '0.03658', 'num_input_tokens_seen': 2974291, 'train_runtime': '1505', 'train_tokens_per_second': '1976'} +{'loss': '0.5942', 'grad_norm': '1.641', 'learning_rate': '1.829e-05', 'epoch': '0.03661', 'num_input_tokens_seen': 2976338, 'train_runtime': '1506', 'train_tokens_per_second': '1976'} +{'loss': '0.5823', 'grad_norm': '1.445', 'learning_rate': '1.83e-05', 'epoch': '0.03663', 'num_input_tokens_seen': 2978385, 'train_runtime': '1507', 'train_tokens_per_second': '1976'} +{'loss': '0.4811', 'grad_norm': '1.472', 'learning_rate': '1.832e-05', 'epoch': '0.03666', 'num_input_tokens_seen': 2980432, 'train_runtime': '1508', 'train_tokens_per_second': '1976'} +{'loss': '1.258', 'grad_norm': '1.911', 'learning_rate': '1.833e-05', 'epoch': '0.03668', 'num_input_tokens_seen': 2982479, 'train_runtime': '1510', 'train_tokens_per_second': '1976'} +{'loss': '1.25', 'grad_norm': '2.186', 'learning_rate': '1.834e-05', 'epoch': '0.03671', 'num_input_tokens_seen': 2984526, 'train_runtime': '1511', 'train_tokens_per_second': '1976'} +{'loss': '1.132', 'grad_norm': '1.656', 'learning_rate': '1.835e-05', 'epoch': '0.03673', 'num_input_tokens_seen': 2986573, 'train_runtime': '1512', 'train_tokens_per_second': '1976'} +{'loss': '0.654', 'grad_norm': '1.747', 'learning_rate': '1.837e-05', 'epoch': '0.03676', 'num_input_tokens_seen': 2988620, 'train_runtime': '1513', 'train_tokens_per_second': '1976'} +{'loss': '1.376', 'grad_norm': '2.364', 'learning_rate': '1.838e-05', 'epoch': '0.03679', 'num_input_tokens_seen': 2990667, 'train_runtime': '1514', 'train_tokens_per_second': '1976'} +{'loss': '1.867', 'grad_norm': '2.452', 'learning_rate': '1.839e-05', 'epoch': '0.03681', 'num_input_tokens_seen': 2992714, 'train_runtime': '1515', 'train_tokens_per_second': '1976'} +{'loss': '1.343', 'grad_norm': '2.135', 'learning_rate': '1.84e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 2994761, 'train_runtime': '1516', 'train_tokens_per_second': '1976'} +{'loss': '0.7749', 'grad_norm': '1.671', 'learning_rate': '1.842e-05', 'epoch': '0.03686', 'num_input_tokens_seen': 2996808, 'train_runtime': '1517', 'train_tokens_per_second': '1976'} +{'loss': '0.654', 'grad_norm': '1.681', 'learning_rate': '1.843e-05', 'epoch': '0.03689', 'num_input_tokens_seen': 2998855, 'train_runtime': '1518', 'train_tokens_per_second': '1976'} +{'loss': '1.602', 'grad_norm': '3.17', 'learning_rate': '1.844e-05', 'epoch': '0.03691', 'num_input_tokens_seen': 3000902, 'train_runtime': '1519', 'train_tokens_per_second': '1976'} +{'loss': '0.4747', 'grad_norm': '1.484', 'learning_rate': '1.845e-05', 'epoch': '0.03694', 'num_input_tokens_seen': 3002949, 'train_runtime': '1520', 'train_tokens_per_second': '1976'} +{'loss': '0.4683', 'grad_norm': '1.255', 'learning_rate': '1.847e-05', 'epoch': '0.03696', 'num_input_tokens_seen': 3004996, 'train_runtime': '1521', 'train_tokens_per_second': '1976'} +{'loss': '1.764', 'grad_norm': '3.29', 'learning_rate': '1.848e-05', 'epoch': '0.03699', 'num_input_tokens_seen': 3007043, 'train_runtime': '1522', 'train_tokens_per_second': '1976'} +{'loss': '0.5216', 'grad_norm': '1.529', 'learning_rate': '1.849e-05', 'epoch': '0.03701', 'num_input_tokens_seen': 3009090, 'train_runtime': '1523', 'train_tokens_per_second': '1976'} +{'loss': '1.525', 'grad_norm': '2.134', 'learning_rate': '1.85e-05', 'epoch': '0.03704', 'num_input_tokens_seen': 3011137, 'train_runtime': '1524', 'train_tokens_per_second': '1976'} +{'loss': '2.835', 'grad_norm': '2.841', 'learning_rate': '1.852e-05', 'epoch': '0.03706', 'num_input_tokens_seen': 3013184, 'train_runtime': '1525', 'train_tokens_per_second': '1976'} +{'loss': '2.004', 'grad_norm': '2.829', 'learning_rate': '1.853e-05', 'epoch': '0.03709', 'num_input_tokens_seen': 3015231, 'train_runtime': '1526', 'train_tokens_per_second': '1976'} +{'loss': '0.5643', 'grad_norm': '1.405', 'learning_rate': '1.854e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 3017278, 'train_runtime': '1527', 'train_tokens_per_second': '1976'} +{'loss': '1.244', 'grad_norm': '2.133', 'learning_rate': '1.855e-05', 'epoch': '0.03714', 'num_input_tokens_seen': 3019325, 'train_runtime': '1528', 'train_tokens_per_second': '1976'} +{'loss': '0.7244', 'grad_norm': '1.924', 'learning_rate': '1.857e-05', 'epoch': '0.03716', 'num_input_tokens_seen': 3021372, 'train_runtime': '1529', 'train_tokens_per_second': '1976'} +{'loss': '0.6614', 'grad_norm': '2.011', 'learning_rate': '1.858e-05', 'epoch': '0.03719', 'num_input_tokens_seen': 3023419, 'train_runtime': '1530', 'train_tokens_per_second': '1976'} +{'loss': '0.7477', 'grad_norm': '1.556', 'learning_rate': '1.859e-05', 'epoch': '0.03721', 'num_input_tokens_seen': 3025466, 'train_runtime': '1531', 'train_tokens_per_second': '1976'} +{'loss': '1.814', 'grad_norm': '2.673', 'learning_rate': '1.861e-05', 'epoch': '0.03724', 'num_input_tokens_seen': 3027513, 'train_runtime': '1532', 'train_tokens_per_second': '1976'} +{'loss': '0.5901', 'grad_norm': '1.531', 'learning_rate': '1.862e-05', 'epoch': '0.03726', 'num_input_tokens_seen': 3029560, 'train_runtime': '1533', 'train_tokens_per_second': '1976'} +{'loss': '0.6029', 'grad_norm': '1.882', 'learning_rate': '1.863e-05', 'epoch': '0.03729', 'num_input_tokens_seen': 3031607, 'train_runtime': '1534', 'train_tokens_per_second': '1976'} +{'loss': '1.409', 'grad_norm': '2.298', 'learning_rate': '1.864e-05', 'epoch': '0.03731', 'num_input_tokens_seen': 3033654, 'train_runtime': '1535', 'train_tokens_per_second': '1976'} +{'loss': '1.126', 'grad_norm': '2.004', 'learning_rate': '1.866e-05', 'epoch': '0.03734', 'num_input_tokens_seen': 3035701, 'train_runtime': '1536', 'train_tokens_per_second': '1976'} +{'loss': '0.9008', 'grad_norm': '1.921', 'learning_rate': '1.867e-05', 'epoch': '0.03736', 'num_input_tokens_seen': 3037748, 'train_runtime': '1537', 'train_tokens_per_second': '1976'} +{'loss': '0.5944', 'grad_norm': '1.503', 'learning_rate': '1.868e-05', 'epoch': '0.03739', 'num_input_tokens_seen': 3039795, 'train_runtime': '1538', 'train_tokens_per_second': '1976'} +{'loss': '1.214', 'grad_norm': '2.006', 'learning_rate': '1.869e-05', 'epoch': '0.03741', 'num_input_tokens_seen': 3041842, 'train_runtime': '1539', 'train_tokens_per_second': '1976'} +{'loss': '1.347', 'grad_norm': '2.185', 'learning_rate': '1.871e-05', 'epoch': '0.03744', 'num_input_tokens_seen': 3043889, 'train_runtime': '1541', 'train_tokens_per_second': '1976'} +{'loss': '1.115', 'grad_norm': '1.904', 'learning_rate': '1.872e-05', 'epoch': '0.03747', 'num_input_tokens_seen': 3045936, 'train_runtime': '1542', 'train_tokens_per_second': '1976'} +{'loss': '0.5675', 'grad_norm': '1.622', 'learning_rate': '1.873e-05', 'epoch': '0.03749', 'num_input_tokens_seen': 3047983, 'train_runtime': '1543', 'train_tokens_per_second': '1976'} +{'loss': '2.02', 'grad_norm': '2.464', 'learning_rate': '1.874e-05', 'epoch': '0.03752', 'num_input_tokens_seen': 3050030, 'train_runtime': '1544', 'train_tokens_per_second': '1976'} +{'loss': '1.121', 'grad_norm': '1.834', 'learning_rate': '1.876e-05', 'epoch': '0.03754', 'num_input_tokens_seen': 3052077, 'train_runtime': '1545', 'train_tokens_per_second': '1976'} +{'loss': '1.509', 'grad_norm': '2.393', 'learning_rate': '1.877e-05', 'epoch': '0.03757', 'num_input_tokens_seen': 3054124, 'train_runtime': '1546', 'train_tokens_per_second': '1976'} +{'loss': '1.793', 'grad_norm': '2.588', 'learning_rate': '1.878e-05', 'epoch': '0.03759', 'num_input_tokens_seen': 3056171, 'train_runtime': '1547', 'train_tokens_per_second': '1976'} +{'loss': '0.9295', 'grad_norm': '1.664', 'learning_rate': '1.879e-05', 'epoch': '0.03762', 'num_input_tokens_seen': 3058218, 'train_runtime': '1548', 'train_tokens_per_second': '1976'} +{'loss': '1.474', 'grad_norm': '2.383', 'learning_rate': '1.881e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 3060265, 'train_runtime': '1549', 'train_tokens_per_second': '1976'} +{'loss': '0.9355', 'grad_norm': '1.779', 'learning_rate': '1.882e-05', 'epoch': '0.03767', 'num_input_tokens_seen': 3062312, 'train_runtime': '1550', 'train_tokens_per_second': '1976'} +{'loss': '1.627', 'grad_norm': '2.331', 'learning_rate': '1.883e-05', 'epoch': '0.03769', 'num_input_tokens_seen': 3064359, 'train_runtime': '1551', 'train_tokens_per_second': '1976'} +{'loss': '1.505', 'grad_norm': '2.266', 'learning_rate': '1.884e-05', 'epoch': '0.03772', 'num_input_tokens_seen': 3066406, 'train_runtime': '1552', 'train_tokens_per_second': '1976'} +{'loss': '1.41', 'grad_norm': '2.117', 'learning_rate': '1.886e-05', 'epoch': '0.03774', 'num_input_tokens_seen': 3068453, 'train_runtime': '1553', 'train_tokens_per_second': '1976'} +{'loss': '0.9735', 'grad_norm': '1.974', 'learning_rate': '1.887e-05', 'epoch': '0.03777', 'num_input_tokens_seen': 3070500, 'train_runtime': '1554', 'train_tokens_per_second': '1976'} +{'loss': '0.7372', 'grad_norm': '1.702', 'learning_rate': '1.888e-05', 'epoch': '0.03779', 'num_input_tokens_seen': 3072547, 'train_runtime': '1555', 'train_tokens_per_second': '1976'} +{'loss': '0.6152', 'grad_norm': '1.721', 'learning_rate': '1.889e-05', 'epoch': '0.03782', 'num_input_tokens_seen': 3074594, 'train_runtime': '1556', 'train_tokens_per_second': '1976'} +{'loss': '1.023', 'grad_norm': '2.452', 'learning_rate': '1.891e-05', 'epoch': '0.03784', 'num_input_tokens_seen': 3076641, 'train_runtime': '1557', 'train_tokens_per_second': '1976'} +{'loss': '1.177', 'grad_norm': '2.499', 'learning_rate': '1.892e-05', 'epoch': '0.03787', 'num_input_tokens_seen': 3078688, 'train_runtime': '1558', 'train_tokens_per_second': '1976'} +{'loss': '1.17', 'grad_norm': '2.158', 'learning_rate': '1.893e-05', 'epoch': '0.03789', 'num_input_tokens_seen': 3080735, 'train_runtime': '1559', 'train_tokens_per_second': '1976'} +{'loss': '0.9639', 'grad_norm': '1.915', 'learning_rate': '1.895e-05', 'epoch': '0.03792', 'num_input_tokens_seen': 3082782, 'train_runtime': '1560', 'train_tokens_per_second': '1976'} +{'loss': '0.4168', 'grad_norm': '1.327', 'learning_rate': '1.896e-05', 'epoch': '0.03794', 'num_input_tokens_seen': 3084829, 'train_runtime': '1561', 'train_tokens_per_second': '1976'} +{'loss': '0.446', 'grad_norm': '1.486', 'learning_rate': '1.897e-05', 'epoch': '0.03797', 'num_input_tokens_seen': 3086876, 'train_runtime': '1562', 'train_tokens_per_second': '1976'} +{'loss': '1.065', 'grad_norm': '1.814', 'learning_rate': '1.898e-05', 'epoch': '0.03799', 'num_input_tokens_seen': 3088923, 'train_runtime': '1563', 'train_tokens_per_second': '1976'} +{'loss': '0.6996', 'grad_norm': '1.613', 'learning_rate': '1.9e-05', 'epoch': '0.03802', 'num_input_tokens_seen': 3090970, 'train_runtime': '1564', 'train_tokens_per_second': '1976'} +{'loss': '1.328', 'grad_norm': '2.031', 'learning_rate': '1.901e-05', 'epoch': '0.03804', 'num_input_tokens_seen': 3093017, 'train_runtime': '1565', 'train_tokens_per_second': '1976'} +{'loss': '0.5818', 'grad_norm': '1.702', 'learning_rate': '1.902e-05', 'epoch': '0.03807', 'num_input_tokens_seen': 3095064, 'train_runtime': '1566', 'train_tokens_per_second': '1976'} +{'loss': '2.002', 'grad_norm': '2.775', 'learning_rate': '1.903e-05', 'epoch': '0.03809', 'num_input_tokens_seen': 3097111, 'train_runtime': '1567', 'train_tokens_per_second': '1976'} +{'loss': '1.387', 'grad_norm': '2.55', 'learning_rate': '1.905e-05', 'epoch': '0.03812', 'num_input_tokens_seen': 3099158, 'train_runtime': '1568', 'train_tokens_per_second': '1976'} +{'loss': '0.9797', 'grad_norm': '1.827', 'learning_rate': '1.906e-05', 'epoch': '0.03814', 'num_input_tokens_seen': 3101205, 'train_runtime': '1569', 'train_tokens_per_second': '1976'} +{'loss': '1.442', 'grad_norm': '2.126', 'learning_rate': '1.907e-05', 'epoch': '0.03817', 'num_input_tokens_seen': 3103252, 'train_runtime': '1570', 'train_tokens_per_second': '1976'} +{'loss': '1.104', 'grad_norm': '2.063', 'learning_rate': '1.908e-05', 'epoch': '0.0382', 'num_input_tokens_seen': 3105299, 'train_runtime': '1572', 'train_tokens_per_second': '1976'} +{'loss': '0.9823', 'grad_norm': '1.879', 'learning_rate': '1.91e-05', 'epoch': '0.03822', 'num_input_tokens_seen': 3107346, 'train_runtime': '1573', 'train_tokens_per_second': '1976'} +{'loss': '1.539', 'grad_norm': '2.851', 'learning_rate': '1.911e-05', 'epoch': '0.03825', 'num_input_tokens_seen': 3109393, 'train_runtime': '1574', 'train_tokens_per_second': '1976'} +{'loss': '2.013', 'grad_norm': '3.107', 'learning_rate': '1.912e-05', 'epoch': '0.03827', 'num_input_tokens_seen': 3111440, 'train_runtime': '1575', 'train_tokens_per_second': '1976'} +{'loss': '0.5916', 'grad_norm': '1.754', 'learning_rate': '1.913e-05', 'epoch': '0.0383', 'num_input_tokens_seen': 3113487, 'train_runtime': '1576', 'train_tokens_per_second': '1976'} +{'loss': '1.678', 'grad_norm': '3.461', 'learning_rate': '1.915e-05', 'epoch': '0.03832', 'num_input_tokens_seen': 3115534, 'train_runtime': '1577', 'train_tokens_per_second': '1976'} +{'loss': '1.134', 'grad_norm': '2.203', 'learning_rate': '1.916e-05', 'epoch': '0.03835', 'num_input_tokens_seen': 3117581, 'train_runtime': '1578', 'train_tokens_per_second': '1976'} +{'loss': '2.464', 'grad_norm': '3.324', 'learning_rate': '1.917e-05', 'epoch': '0.03837', 'num_input_tokens_seen': 3119628, 'train_runtime': '1579', 'train_tokens_per_second': '1976'} +{'loss': '0.9388', 'grad_norm': '1.711', 'learning_rate': '1.918e-05', 'epoch': '0.0384', 'num_input_tokens_seen': 3121675, 'train_runtime': '1580', 'train_tokens_per_second': '1976'} +{'loss': '0.5231', 'grad_norm': '1.665', 'learning_rate': '1.92e-05', 'epoch': '0.03842', 'num_input_tokens_seen': 3123722, 'train_runtime': '1581', 'train_tokens_per_second': '1976'} +{'loss': '1.135', 'grad_norm': '2.117', 'learning_rate': '1.921e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 3125769, 'train_runtime': '1582', 'train_tokens_per_second': '1976'} +{'loss': '1.394', 'grad_norm': '2.219', 'learning_rate': '1.922e-05', 'epoch': '0.03847', 'num_input_tokens_seen': 3127816, 'train_runtime': '1583', 'train_tokens_per_second': '1976'} +{'loss': '0.9532', 'grad_norm': '2.013', 'learning_rate': '1.923e-05', 'epoch': '0.0385', 'num_input_tokens_seen': 3129863, 'train_runtime': '1584', 'train_tokens_per_second': '1976'} +{'loss': '1.058', 'grad_norm': '2.283', 'learning_rate': '1.925e-05', 'epoch': '0.03852', 'num_input_tokens_seen': 3131910, 'train_runtime': '1585', 'train_tokens_per_second': '1976'} +{'loss': '0.7609', 'grad_norm': '1.682', 'learning_rate': '1.926e-05', 'epoch': '0.03855', 'num_input_tokens_seen': 3133957, 'train_runtime': '1586', 'train_tokens_per_second': '1976'} +{'loss': '0.8017', 'grad_norm': '1.698', 'learning_rate': '1.927e-05', 'epoch': '0.03857', 'num_input_tokens_seen': 3136004, 'train_runtime': '1587', 'train_tokens_per_second': '1976'} +{'loss': '0.9711', 'grad_norm': '1.993', 'learning_rate': '1.928e-05', 'epoch': '0.0386', 'num_input_tokens_seen': 3138051, 'train_runtime': '1588', 'train_tokens_per_second': '1976'} +{'loss': '1.617', 'grad_norm': '2.639', 'learning_rate': '1.93e-05', 'epoch': '0.03862', 'num_input_tokens_seen': 3140098, 'train_runtime': '1589', 'train_tokens_per_second': '1976'} +{'loss': '0.4081', 'grad_norm': '1.494', 'learning_rate': '1.931e-05', 'epoch': '0.03865', 'num_input_tokens_seen': 3142145, 'train_runtime': '1590', 'train_tokens_per_second': '1976'} +{'loss': '1.123', 'grad_norm': '2.108', 'learning_rate': '1.932e-05', 'epoch': '0.03867', 'num_input_tokens_seen': 3144192, 'train_runtime': '1591', 'train_tokens_per_second': '1976'} +{'loss': '1.103', 'grad_norm': '2.374', 'learning_rate': '1.934e-05', 'epoch': '0.0387', 'num_input_tokens_seen': 3146239, 'train_runtime': '1592', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '2.073', 'learning_rate': '1.935e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 3148286, 'train_runtime': '1593', 'train_tokens_per_second': '1976'} +{'loss': '0.4364', 'grad_norm': '1.515', 'learning_rate': '1.936e-05', 'epoch': '0.03875', 'num_input_tokens_seen': 3150333, 'train_runtime': '1594', 'train_tokens_per_second': '1976'} +{'loss': '3.024', 'grad_norm': '3.73', 'learning_rate': '1.937e-05', 'epoch': '0.03877', 'num_input_tokens_seen': 3152380, 'train_runtime': '1595', 'train_tokens_per_second': '1976'} +{'loss': '0.7115', 'grad_norm': '2.017', 'learning_rate': '1.939e-05', 'epoch': '0.0388', 'num_input_tokens_seen': 3154427, 'train_runtime': '1596', 'train_tokens_per_second': '1976'} +{'loss': '0.7028', 'grad_norm': '1.837', 'learning_rate': '1.94e-05', 'epoch': '0.03882', 'num_input_tokens_seen': 3156474, 'train_runtime': '1597', 'train_tokens_per_second': '1976'} +{'loss': '0.8213', 'grad_norm': '1.974', 'learning_rate': '1.941e-05', 'epoch': '0.03885', 'num_input_tokens_seen': 3158521, 'train_runtime': '1598', 'train_tokens_per_second': '1976'} +{'loss': '2.694', 'grad_norm': '3.304', 'learning_rate': '1.942e-05', 'epoch': '0.03888', 'num_input_tokens_seen': 3160568, 'train_runtime': '1599', 'train_tokens_per_second': '1976'} +{'loss': '1.289', 'grad_norm': '1.863', 'learning_rate': '1.944e-05', 'epoch': '0.0389', 'num_input_tokens_seen': 3162615, 'train_runtime': '1600', 'train_tokens_per_second': '1976'} +{'loss': '1.323', 'grad_norm': '2.342', 'learning_rate': '1.945e-05', 'epoch': '0.03893', 'num_input_tokens_seen': 3164662, 'train_runtime': '1602', 'train_tokens_per_second': '1976'} +{'loss': '0.7027', 'grad_norm': '1.921', 'learning_rate': '1.946e-05', 'epoch': '0.03895', 'num_input_tokens_seen': 3166709, 'train_runtime': '1603', 'train_tokens_per_second': '1976'} +{'loss': '1.954', 'grad_norm': '2.741', 'learning_rate': '1.947e-05', 'epoch': '0.03898', 'num_input_tokens_seen': 3168756, 'train_runtime': '1604', 'train_tokens_per_second': '1976'} +{'loss': '0.6045', 'grad_norm': '1.47', 'learning_rate': '1.949e-05', 'epoch': '0.039', 'num_input_tokens_seen': 3170803, 'train_runtime': '1605', 'train_tokens_per_second': '1976'} +{'loss': '1.059', 'grad_norm': '2.082', 'learning_rate': '1.95e-05', 'epoch': '0.03903', 'num_input_tokens_seen': 3172850, 'train_runtime': '1606', 'train_tokens_per_second': '1976'} +{'loss': '1.72', 'grad_norm': '1.985', 'learning_rate': '1.951e-05', 'epoch': '0.03905', 'num_input_tokens_seen': 3174897, 'train_runtime': '1607', 'train_tokens_per_second': '1976'} +{'loss': '0.6057', 'grad_norm': '1.556', 'learning_rate': '1.952e-05', 'epoch': '0.03908', 'num_input_tokens_seen': 3176944, 'train_runtime': '1608', 'train_tokens_per_second': '1976'} +{'loss': '2.988', 'grad_norm': '4.921', 'learning_rate': '1.954e-05', 'epoch': '0.0391', 'num_input_tokens_seen': 3178991, 'train_runtime': '1609', 'train_tokens_per_second': '1976'} +{'loss': '2.45', 'grad_norm': '2.811', 'learning_rate': '1.955e-05', 'epoch': '0.03913', 'num_input_tokens_seen': 3181038, 'train_runtime': '1610', 'train_tokens_per_second': '1976'} +{'loss': '2.539', 'grad_norm': '2.769', 'learning_rate': '1.956e-05', 'epoch': '0.03915', 'num_input_tokens_seen': 3183085, 'train_runtime': '1611', 'train_tokens_per_second': '1976'} +{'loss': '0.5238', 'grad_norm': '1.469', 'learning_rate': '1.957e-05', 'epoch': '0.03918', 'num_input_tokens_seen': 3185132, 'train_runtime': '1612', 'train_tokens_per_second': '1976'} +{'loss': '1.601', 'grad_norm': '3.066', 'learning_rate': '1.959e-05', 'epoch': '0.0392', 'num_input_tokens_seen': 3187179, 'train_runtime': '1613', 'train_tokens_per_second': '1976'} +{'loss': '1.698', 'grad_norm': '2.607', 'learning_rate': '1.96e-05', 'epoch': '0.03923', 'num_input_tokens_seen': 3189226, 'train_runtime': '1614', 'train_tokens_per_second': '1976'} +{'loss': '2.835', 'grad_norm': '3.077', 'learning_rate': '1.961e-05', 'epoch': '0.03925', 'num_input_tokens_seen': 3191273, 'train_runtime': '1615', 'train_tokens_per_second': '1976'} +{'loss': '1.173', 'grad_norm': '2.162', 'learning_rate': '1.962e-05', 'epoch': '0.03928', 'num_input_tokens_seen': 3193320, 'train_runtime': '1616', 'train_tokens_per_second': '1976'} +{'loss': '2.104', 'grad_norm': '2.96', 'learning_rate': '1.964e-05', 'epoch': '0.0393', 'num_input_tokens_seen': 3195367, 'train_runtime': '1617', 'train_tokens_per_second': '1976'} +{'loss': '0.6584', 'grad_norm': '1.545', 'learning_rate': '1.965e-05', 'epoch': '0.03933', 'num_input_tokens_seen': 3197414, 'train_runtime': '1618', 'train_tokens_per_second': '1976'} +{'loss': '0.6752', 'grad_norm': '1.985', 'learning_rate': '1.966e-05', 'epoch': '0.03935', 'num_input_tokens_seen': 3199461, 'train_runtime': '1619', 'train_tokens_per_second': '1976'} +{'loss': '0.466', 'grad_norm': '1.392', 'learning_rate': '1.968e-05', 'epoch': '0.03938', 'num_input_tokens_seen': 3201508, 'train_runtime': '1620', 'train_tokens_per_second': '1976'} +{'loss': '0.5457', 'grad_norm': '1.64', 'learning_rate': '1.969e-05', 'epoch': '0.0394', 'num_input_tokens_seen': 3203555, 'train_runtime': '1621', 'train_tokens_per_second': '1976'} +{'loss': '0.559', 'grad_norm': '1.72', 'learning_rate': '1.97e-05', 'epoch': '0.03943', 'num_input_tokens_seen': 3205602, 'train_runtime': '1622', 'train_tokens_per_second': '1976'} +{'loss': '0.4668', 'grad_norm': '1.422', 'learning_rate': '1.971e-05', 'epoch': '0.03945', 'num_input_tokens_seen': 3207649, 'train_runtime': '1623', 'train_tokens_per_second': '1976'} +{'loss': '1.972', 'grad_norm': '2.585', 'learning_rate': '1.973e-05', 'epoch': '0.03948', 'num_input_tokens_seen': 3209696, 'train_runtime': '1624', 'train_tokens_per_second': '1976'} +{'loss': '1.693', 'grad_norm': '2.693', 'learning_rate': '1.974e-05', 'epoch': '0.0395', 'num_input_tokens_seen': 3211743, 'train_runtime': '1625', 'train_tokens_per_second': '1976'} +{'loss': '0.75', 'grad_norm': '1.851', 'learning_rate': '1.975e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 3213790, 'train_runtime': '1626', 'train_tokens_per_second': '1976'} +{'loss': '1.092', 'grad_norm': '2.343', 'learning_rate': '1.976e-05', 'epoch': '0.03955', 'num_input_tokens_seen': 3215837, 'train_runtime': '1627', 'train_tokens_per_second': '1976'} +{'loss': '1.258', 'grad_norm': '2.262', 'learning_rate': '1.978e-05', 'epoch': '0.03958', 'num_input_tokens_seen': 3217884, 'train_runtime': '1628', 'train_tokens_per_second': '1976'} +{'loss': '1.165', 'grad_norm': '2.249', 'learning_rate': '1.979e-05', 'epoch': '0.03961', 'num_input_tokens_seen': 3219931, 'train_runtime': '1629', 'train_tokens_per_second': '1976'} +{'loss': '1.135', 'grad_norm': '2.229', 'learning_rate': '1.98e-05', 'epoch': '0.03963', 'num_input_tokens_seen': 3221978, 'train_runtime': '1630', 'train_tokens_per_second': '1976'} +{'loss': '0.5092', 'grad_norm': '1.504', 'learning_rate': '1.981e-05', 'epoch': '0.03966', 'num_input_tokens_seen': 3224025, 'train_runtime': '1632', 'train_tokens_per_second': '1976'} +{'loss': '0.4759', 'grad_norm': '1.523', 'learning_rate': '1.983e-05', 'epoch': '0.03968', 'num_input_tokens_seen': 3226072, 'train_runtime': '1633', 'train_tokens_per_second': '1976'} +{'loss': '1.006', 'grad_norm': '2.382', 'learning_rate': '1.984e-05', 'epoch': '0.03971', 'num_input_tokens_seen': 3228119, 'train_runtime': '1634', 'train_tokens_per_second': '1976'} +{'loss': '1.206', 'grad_norm': '2.166', 'learning_rate': '1.985e-05', 'epoch': '0.03973', 'num_input_tokens_seen': 3230166, 'train_runtime': '1635', 'train_tokens_per_second': '1976'} +{'loss': '1.908', 'grad_norm': '2.594', 'learning_rate': '1.986e-05', 'epoch': '0.03976', 'num_input_tokens_seen': 3232213, 'train_runtime': '1636', 'train_tokens_per_second': '1976'} +{'loss': '0.8346', 'grad_norm': '2.023', 'learning_rate': '1.988e-05', 'epoch': '0.03978', 'num_input_tokens_seen': 3234260, 'train_runtime': '1637', 'train_tokens_per_second': '1976'} +{'loss': '1.13', 'grad_norm': '2.248', 'learning_rate': '1.989e-05', 'epoch': '0.03981', 'num_input_tokens_seen': 3236307, 'train_runtime': '1638', 'train_tokens_per_second': '1976'} +{'loss': '0.5027', 'grad_norm': '1.495', 'learning_rate': '1.99e-05', 'epoch': '0.03983', 'num_input_tokens_seen': 3238354, 'train_runtime': '1639', 'train_tokens_per_second': '1976'} +{'loss': '0.4798', 'grad_norm': '1.442', 'learning_rate': '1.991e-05', 'epoch': '0.03986', 'num_input_tokens_seen': 3240401, 'train_runtime': '1640', 'train_tokens_per_second': '1976'} +{'loss': '0.8695', 'grad_norm': '1.725', 'learning_rate': '1.993e-05', 'epoch': '0.03988', 'num_input_tokens_seen': 3242448, 'train_runtime': '1641', 'train_tokens_per_second': '1976'} +{'loss': '0.881', 'grad_norm': '1.75', 'learning_rate': '1.994e-05', 'epoch': '0.03991', 'num_input_tokens_seen': 3244495, 'train_runtime': '1642', 'train_tokens_per_second': '1976'} +{'loss': '2.539', 'grad_norm': '2.869', 'learning_rate': '1.995e-05', 'epoch': '0.03993', 'num_input_tokens_seen': 3246542, 'train_runtime': '1643', 'train_tokens_per_second': '1976'} +{'loss': '0.5052', 'grad_norm': '1.215', 'learning_rate': '1.996e-05', 'epoch': '0.03996', 'num_input_tokens_seen': 3248589, 'train_runtime': '1644', 'train_tokens_per_second': '1976'} +{'loss': '0.6465', 'grad_norm': '1.755', 'learning_rate': '1.998e-05', 'epoch': '0.03998', 'num_input_tokens_seen': 3250636, 'train_runtime': '1645', 'train_tokens_per_second': '1976'} +{'loss': '1.198', 'grad_norm': '2.004', 'learning_rate': '1.999e-05', 'epoch': '0.04001', 'num_input_tokens_seen': 3252683, 'train_runtime': '1646', 'train_tokens_per_second': '1976'} +{'loss': '1.838', 'grad_norm': '2.613', 'learning_rate': '2e-05', 'epoch': '0.04003', 'num_input_tokens_seen': 3254730, 'train_runtime': '1647', 'train_tokens_per_second': '1976'} +{'loss': '2.013', 'grad_norm': '2.684', 'learning_rate': '2.002e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 3256777, 'train_runtime': '1648', 'train_tokens_per_second': '1976'} +{'loss': '0.508', 'grad_norm': '1.821', 'learning_rate': '2.003e-05', 'epoch': '0.04008', 'num_input_tokens_seen': 3258824, 'train_runtime': '1649', 'train_tokens_per_second': '1976'} +{'loss': '0.7344', 'grad_norm': '1.678', 'learning_rate': '2.004e-05', 'epoch': '0.04011', 'num_input_tokens_seen': 3260871, 'train_runtime': '1650', 'train_tokens_per_second': '1976'} +{'loss': '0.8463', 'grad_norm': '2.156', 'learning_rate': '2.005e-05', 'epoch': '0.04013', 'num_input_tokens_seen': 3262918, 'train_runtime': '1651', 'train_tokens_per_second': '1976'} +{'loss': '0.8831', 'grad_norm': '2.239', 'learning_rate': '2.007e-05', 'epoch': '0.04016', 'num_input_tokens_seen': 3264965, 'train_runtime': '1652', 'train_tokens_per_second': '1976'} +{'loss': '0.9886', 'grad_norm': '2.448', 'learning_rate': '2.008e-05', 'epoch': '0.04018', 'num_input_tokens_seen': 3267012, 'train_runtime': '1653', 'train_tokens_per_second': '1976'} +{'loss': '1.729', 'grad_norm': '2.749', 'learning_rate': '2.009e-05', 'epoch': '0.04021', 'num_input_tokens_seen': 3269059, 'train_runtime': '1654', 'train_tokens_per_second': '1976'} +{'loss': '1.297', 'grad_norm': '2.153', 'learning_rate': '2.01e-05', 'epoch': '0.04023', 'num_input_tokens_seen': 3271106, 'train_runtime': '1655', 'train_tokens_per_second': '1976'} +{'loss': '1.84', 'grad_norm': '3.006', 'learning_rate': '2.012e-05', 'epoch': '0.04026', 'num_input_tokens_seen': 3273153, 'train_runtime': '1656', 'train_tokens_per_second': '1976'} +{'loss': '1.125', 'grad_norm': '2.372', 'learning_rate': '2.013e-05', 'epoch': '0.04029', 'num_input_tokens_seen': 3275200, 'train_runtime': '1657', 'train_tokens_per_second': '1976'} +{'loss': '2.39', 'grad_norm': '2.696', 'learning_rate': '2.014e-05', 'epoch': '0.04031', 'num_input_tokens_seen': 3277247, 'train_runtime': '1658', 'train_tokens_per_second': '1976'} +{'loss': '1.099', 'grad_norm': '2.835', 'learning_rate': '2.015e-05', 'epoch': '0.04034', 'num_input_tokens_seen': 3279294, 'train_runtime': '1659', 'train_tokens_per_second': '1976'} +{'loss': '0.487', 'grad_norm': '1.556', 'learning_rate': '2.017e-05', 'epoch': '0.04036', 'num_input_tokens_seen': 3281341, 'train_runtime': '1660', 'train_tokens_per_second': '1976'} +{'loss': '0.7055', 'grad_norm': '1.771', 'learning_rate': '2.018e-05', 'epoch': '0.04039', 'num_input_tokens_seen': 3283388, 'train_runtime': '1661', 'train_tokens_per_second': '1976'} +{'loss': '0.7414', 'grad_norm': '1.99', 'learning_rate': '2.019e-05', 'epoch': '0.04041', 'num_input_tokens_seen': 3285435, 'train_runtime': '1663', 'train_tokens_per_second': '1976'} +{'loss': '0.8289', 'grad_norm': '2.262', 'learning_rate': '2.02e-05', 'epoch': '0.04044', 'num_input_tokens_seen': 3287482, 'train_runtime': '1664', 'train_tokens_per_second': '1976'} +{'loss': '1.053', 'grad_norm': '1.98', 'learning_rate': '2.022e-05', 'epoch': '0.04046', 'num_input_tokens_seen': 3289529, 'train_runtime': '1665', 'train_tokens_per_second': '1976'} +{'loss': '1.186', 'grad_norm': '2.261', 'learning_rate': '2.023e-05', 'epoch': '0.04049', 'num_input_tokens_seen': 3291576, 'train_runtime': '1666', 'train_tokens_per_second': '1976'} +{'loss': '0.5125', 'grad_norm': '1.733', 'learning_rate': '2.024e-05', 'epoch': '0.04051', 'num_input_tokens_seen': 3293623, 'train_runtime': '1667', 'train_tokens_per_second': '1976'} +{'loss': '0.888', 'grad_norm': '2.345', 'learning_rate': '2.025e-05', 'epoch': '0.04054', 'num_input_tokens_seen': 3295670, 'train_runtime': '1668', 'train_tokens_per_second': '1976'} +{'loss': '0.5831', 'grad_norm': '1.582', 'learning_rate': '2.027e-05', 'epoch': '0.04056', 'num_input_tokens_seen': 3297717, 'train_runtime': '1669', 'train_tokens_per_second': '1976'} +{'loss': '1.957', 'grad_norm': '2.863', 'learning_rate': '2.028e-05', 'epoch': '0.04059', 'num_input_tokens_seen': 3299764, 'train_runtime': '1670', 'train_tokens_per_second': '1976'} +{'loss': '0.7196', 'grad_norm': '2.345', 'learning_rate': '2.029e-05', 'epoch': '0.04061', 'num_input_tokens_seen': 3301811, 'train_runtime': '1671', 'train_tokens_per_second': '1976'} +{'loss': '1.199', 'grad_norm': '2.389', 'learning_rate': '2.03e-05', 'epoch': '0.04064', 'num_input_tokens_seen': 3303858, 'train_runtime': '1672', 'train_tokens_per_second': '1976'} +{'loss': '1.102', 'grad_norm': '1.907', 'learning_rate': '2.032e-05', 'epoch': '0.04066', 'num_input_tokens_seen': 3305905, 'train_runtime': '1673', 'train_tokens_per_second': '1976'} +{'loss': '1.081', 'grad_norm': '2.302', 'learning_rate': '2.033e-05', 'epoch': '0.04069', 'num_input_tokens_seen': 3307952, 'train_runtime': '1674', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '2.504', 'learning_rate': '2.034e-05', 'epoch': '0.04071', 'num_input_tokens_seen': 3309999, 'train_runtime': '1675', 'train_tokens_per_second': '1976'} +{'loss': '1.415', 'grad_norm': '2.558', 'learning_rate': '2.035e-05', 'epoch': '0.04074', 'num_input_tokens_seen': 3312046, 'train_runtime': '1676', 'train_tokens_per_second': '1976'} +{'loss': '1.276', 'grad_norm': '2.397', 'learning_rate': '2.037e-05', 'epoch': '0.04076', 'num_input_tokens_seen': 3314093, 'train_runtime': '1677', 'train_tokens_per_second': '1976'} +{'loss': '1.4', 'grad_norm': '2.929', 'learning_rate': '2.038e-05', 'epoch': '0.04079', 'num_input_tokens_seen': 3316140, 'train_runtime': '1678', 'train_tokens_per_second': '1976'} +{'loss': '1.135', 'grad_norm': '2.583', 'learning_rate': '2.039e-05', 'epoch': '0.04081', 'num_input_tokens_seen': 3318187, 'train_runtime': '1679', 'train_tokens_per_second': '1976'} +{'loss': '0.9688', 'grad_norm': '2.31', 'learning_rate': '2.041e-05', 'epoch': '0.04084', 'num_input_tokens_seen': 3320234, 'train_runtime': '1680', 'train_tokens_per_second': '1976'} +{'loss': '0.9517', 'grad_norm': '1.821', 'learning_rate': '2.042e-05', 'epoch': '0.04086', 'num_input_tokens_seen': 3322281, 'train_runtime': '1681', 'train_tokens_per_second': '1976'} +{'loss': '1.055', 'grad_norm': '2.249', 'learning_rate': '2.043e-05', 'epoch': '0.04089', 'num_input_tokens_seen': 3324328, 'train_runtime': '1682', 'train_tokens_per_second': '1976'} +{'loss': '0.752', 'grad_norm': '1.946', 'learning_rate': '2.044e-05', 'epoch': '0.04091', 'num_input_tokens_seen': 3326375, 'train_runtime': '1683', 'train_tokens_per_second': '1976'} +{'loss': '1.9', 'grad_norm': '2.707', 'learning_rate': '2.046e-05', 'epoch': '0.04094', 'num_input_tokens_seen': 3328422, 'train_runtime': '1684', 'train_tokens_per_second': '1976'} +{'loss': '1.38', 'grad_norm': '2.414', 'learning_rate': '2.047e-05', 'epoch': '0.04096', 'num_input_tokens_seen': 3330469, 'train_runtime': '1685', 'train_tokens_per_second': '1976'} +{'loss': '0.7678', 'grad_norm': '1.798', 'learning_rate': '2.048e-05', 'epoch': '0.04099', 'num_input_tokens_seen': 3332516, 'train_runtime': '1686', 'train_tokens_per_second': '1976'} +{'loss': '0.5044', 'grad_norm': '1.52', 'learning_rate': '2.049e-05', 'epoch': '0.04102', 'num_input_tokens_seen': 3334563, 'train_runtime': '1687', 'train_tokens_per_second': '1976'} +{'loss': '0.4503', 'grad_norm': '1.495', 'learning_rate': '2.051e-05', 'epoch': '0.04104', 'num_input_tokens_seen': 3336610, 'train_runtime': '1688', 'train_tokens_per_second': '1976'} +{'loss': '0.5581', 'grad_norm': '1.426', 'learning_rate': '2.052e-05', 'epoch': '0.04107', 'num_input_tokens_seen': 3338657, 'train_runtime': '1689', 'train_tokens_per_second': '1976'} +{'loss': '2.746', 'grad_norm': '3.719', 'learning_rate': '2.053e-05', 'epoch': '0.04109', 'num_input_tokens_seen': 3340704, 'train_runtime': '1690', 'train_tokens_per_second': '1976'} +{'loss': '1.179', 'grad_norm': '2.443', 'learning_rate': '2.054e-05', 'epoch': '0.04112', 'num_input_tokens_seen': 3342751, 'train_runtime': '1691', 'train_tokens_per_second': '1976'} +{'loss': '0.8815', 'grad_norm': '1.854', 'learning_rate': '2.056e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 3344798, 'train_runtime': '1693', 'train_tokens_per_second': '1976'} +{'loss': '1.021', 'grad_norm': '2.175', 'learning_rate': '2.057e-05', 'epoch': '0.04117', 'num_input_tokens_seen': 3346845, 'train_runtime': '1694', 'train_tokens_per_second': '1976'} +{'loss': '2.15', 'grad_norm': '3.094', 'learning_rate': '2.058e-05', 'epoch': '0.04119', 'num_input_tokens_seen': 3348892, 'train_runtime': '1695', 'train_tokens_per_second': '1976'} +{'loss': '1.643', 'grad_norm': '2.75', 'learning_rate': '2.059e-05', 'epoch': '0.04122', 'num_input_tokens_seen': 3350939, 'train_runtime': '1696', 'train_tokens_per_second': '1976'} +{'loss': '0.3986', 'grad_norm': '1.559', 'learning_rate': '2.061e-05', 'epoch': '0.04124', 'num_input_tokens_seen': 3352986, 'train_runtime': '1697', 'train_tokens_per_second': '1976'} +{'loss': '0.7306', 'grad_norm': '1.532', 'learning_rate': '2.062e-05', 'epoch': '0.04127', 'num_input_tokens_seen': 3355033, 'train_runtime': '1698', 'train_tokens_per_second': '1976'} +{'loss': '1.393', 'grad_norm': '2.832', 'learning_rate': '2.063e-05', 'epoch': '0.04129', 'num_input_tokens_seen': 3357080, 'train_runtime': '1699', 'train_tokens_per_second': '1976'} +{'loss': '0.5704', 'grad_norm': '1.573', 'learning_rate': '2.064e-05', 'epoch': '0.04132', 'num_input_tokens_seen': 3359127, 'train_runtime': '1700', 'train_tokens_per_second': '1976'} +{'loss': '1.588', 'grad_norm': '2.65', 'learning_rate': '2.066e-05', 'epoch': '0.04134', 'num_input_tokens_seen': 3361174, 'train_runtime': '1701', 'train_tokens_per_second': '1976'} +{'loss': '1.229', 'grad_norm': '2.522', 'learning_rate': '2.067e-05', 'epoch': '0.04137', 'num_input_tokens_seen': 3363221, 'train_runtime': '1702', 'train_tokens_per_second': '1976'} +{'loss': '1.102', 'grad_norm': '2.087', 'learning_rate': '2.068e-05', 'epoch': '0.04139', 'num_input_tokens_seen': 3365268, 'train_runtime': '1703', 'train_tokens_per_second': '1976'} +{'loss': '0.6978', 'grad_norm': '1.624', 'learning_rate': '2.069e-05', 'epoch': '0.04142', 'num_input_tokens_seen': 3367315, 'train_runtime': '1704', 'train_tokens_per_second': '1976'} +{'loss': '0.6826', 'grad_norm': '1.927', 'learning_rate': '2.071e-05', 'epoch': '0.04144', 'num_input_tokens_seen': 3369362, 'train_runtime': '1705', 'train_tokens_per_second': '1976'} +{'loss': '1.142', 'grad_norm': '2.262', 'learning_rate': '2.072e-05', 'epoch': '0.04147', 'num_input_tokens_seen': 3371409, 'train_runtime': '1706', 'train_tokens_per_second': '1976'} +{'loss': '1.484', 'grad_norm': '2.902', 'learning_rate': '2.073e-05', 'epoch': '0.04149', 'num_input_tokens_seen': 3373456, 'train_runtime': '1707', 'train_tokens_per_second': '1976'} +{'loss': '0.5436', 'grad_norm': '1.453', 'learning_rate': '2.075e-05', 'epoch': '0.04152', 'num_input_tokens_seen': 3375503, 'train_runtime': '1708', 'train_tokens_per_second': '1976'} +{'loss': '0.797', 'grad_norm': '1.691', 'learning_rate': '2.076e-05', 'epoch': '0.04154', 'num_input_tokens_seen': 3377550, 'train_runtime': '1709', 'train_tokens_per_second': '1976'} +{'loss': '2.219', 'grad_norm': '3.7', 'learning_rate': '2.077e-05', 'epoch': '0.04157', 'num_input_tokens_seen': 3379597, 'train_runtime': '1710', 'train_tokens_per_second': '1976'} +{'loss': '0.6698', 'grad_norm': '1.885', 'learning_rate': '2.078e-05', 'epoch': '0.04159', 'num_input_tokens_seen': 3381644, 'train_runtime': '1711', 'train_tokens_per_second': '1976'} +{'loss': '0.9658', 'grad_norm': '2.243', 'learning_rate': '2.08e-05', 'epoch': '0.04162', 'num_input_tokens_seen': 3383691, 'train_runtime': '1712', 'train_tokens_per_second': '1976'} +{'loss': '1.766', 'grad_norm': '3.748', 'learning_rate': '2.081e-05', 'epoch': '0.04164', 'num_input_tokens_seen': 3385738, 'train_runtime': '1713', 'train_tokens_per_second': '1976'} +{'loss': '0.5238', 'grad_norm': '1.795', 'learning_rate': '2.082e-05', 'epoch': '0.04167', 'num_input_tokens_seen': 3387785, 'train_runtime': '1714', 'train_tokens_per_second': '1976'} +{'loss': '0.7606', 'grad_norm': '1.758', 'learning_rate': '2.083e-05', 'epoch': '0.04169', 'num_input_tokens_seen': 3389832, 'train_runtime': '1715', 'train_tokens_per_second': '1976'} +{'loss': '0.6203', 'grad_norm': '1.71', 'learning_rate': '2.085e-05', 'epoch': '0.04172', 'num_input_tokens_seen': 3391879, 'train_runtime': '1716', 'train_tokens_per_second': '1976'} +{'loss': '0.6677', 'grad_norm': '1.923', 'learning_rate': '2.086e-05', 'epoch': '0.04175', 'num_input_tokens_seen': 3393926, 'train_runtime': '1717', 'train_tokens_per_second': '1976'} +{'loss': '2.62', 'grad_norm': '2.892', 'learning_rate': '2.087e-05', 'epoch': '0.04177', 'num_input_tokens_seen': 3395973, 'train_runtime': '1718', 'train_tokens_per_second': '1976'} +{'loss': '1.33', 'grad_norm': '2.068', 'learning_rate': '2.088e-05', 'epoch': '0.0418', 'num_input_tokens_seen': 3398020, 'train_runtime': '1719', 'train_tokens_per_second': '1976'} +{'loss': '0.7913', 'grad_norm': '2.082', 'learning_rate': '2.09e-05', 'epoch': '0.04182', 'num_input_tokens_seen': 3400067, 'train_runtime': '1720', 'train_tokens_per_second': '1976'} +{'loss': '2.948', 'grad_norm': '3.097', 'learning_rate': '2.091e-05', 'epoch': '0.04185', 'num_input_tokens_seen': 3402114, 'train_runtime': '1721', 'train_tokens_per_second': '1976'} +{'loss': '1.003', 'grad_norm': '2.211', 'learning_rate': '2.092e-05', 'epoch': '0.04187', 'num_input_tokens_seen': 3404161, 'train_runtime': '1722', 'train_tokens_per_second': '1976'} +{'loss': '0.4494', 'grad_norm': '1.411', 'learning_rate': '2.093e-05', 'epoch': '0.0419', 'num_input_tokens_seen': 3406208, 'train_runtime': '1724', 'train_tokens_per_second': '1976'} +{'loss': '1.852', 'grad_norm': '2.571', 'learning_rate': '2.095e-05', 'epoch': '0.04192', 'num_input_tokens_seen': 3408255, 'train_runtime': '1725', 'train_tokens_per_second': '1976'} +{'loss': '1.672', 'grad_norm': '2.57', 'learning_rate': '2.096e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 3410302, 'train_runtime': '1726', 'train_tokens_per_second': '1976'} +{'loss': '0.989', 'grad_norm': '1.959', 'learning_rate': '2.097e-05', 'epoch': '0.04197', 'num_input_tokens_seen': 3412349, 'train_runtime': '1727', 'train_tokens_per_second': '1976'} +{'loss': '0.9114', 'grad_norm': '2.479', 'learning_rate': '2.098e-05', 'epoch': '0.042', 'num_input_tokens_seen': 3414396, 'train_runtime': '1728', 'train_tokens_per_second': '1976'} +{'loss': '0.7086', 'grad_norm': '1.849', 'learning_rate': '2.1e-05', 'epoch': '0.04202', 'num_input_tokens_seen': 3416443, 'train_runtime': '1729', 'train_tokens_per_second': '1976'} +{'loss': '0.5135', 'grad_norm': '1.521', 'learning_rate': '2.101e-05', 'epoch': '0.04205', 'num_input_tokens_seen': 3418490, 'train_runtime': '1730', 'train_tokens_per_second': '1976'} +{'loss': '1.92', 'grad_norm': '2.845', 'learning_rate': '2.102e-05', 'epoch': '0.04207', 'num_input_tokens_seen': 3420537, 'train_runtime': '1731', 'train_tokens_per_second': '1976'} +{'loss': '0.9823', 'grad_norm': '2.236', 'learning_rate': '2.103e-05', 'epoch': '0.0421', 'num_input_tokens_seen': 3422584, 'train_runtime': '1732', 'train_tokens_per_second': '1976'} +{'loss': '1.06', 'grad_norm': '2.437', 'learning_rate': '2.105e-05', 'epoch': '0.04212', 'num_input_tokens_seen': 3424631, 'train_runtime': '1733', 'train_tokens_per_second': '1976'} +{'loss': '1.58', 'grad_norm': '2.951', 'learning_rate': '2.106e-05', 'epoch': '0.04215', 'num_input_tokens_seen': 3426678, 'train_runtime': '1734', 'train_tokens_per_second': '1976'} +{'loss': '0.917', 'grad_norm': '1.985', 'learning_rate': '2.107e-05', 'epoch': '0.04217', 'num_input_tokens_seen': 3428725, 'train_runtime': '1735', 'train_tokens_per_second': '1976'} +{'loss': '1.449', 'grad_norm': '3.301', 'learning_rate': '2.109e-05', 'epoch': '0.0422', 'num_input_tokens_seen': 3430772, 'train_runtime': '1736', 'train_tokens_per_second': '1976'} +{'loss': '1.207', 'grad_norm': '2.428', 'learning_rate': '2.11e-05', 'epoch': '0.04222', 'num_input_tokens_seen': 3432819, 'train_runtime': '1737', 'train_tokens_per_second': '1976'} +{'loss': '1.219', 'grad_norm': '2.243', 'learning_rate': '2.111e-05', 'epoch': '0.04225', 'num_input_tokens_seen': 3434866, 'train_runtime': '1738', 'train_tokens_per_second': '1976'} +{'loss': '1.604', 'grad_norm': '2.834', 'learning_rate': '2.112e-05', 'epoch': '0.04227', 'num_input_tokens_seen': 3436913, 'train_runtime': '1739', 'train_tokens_per_second': '1976'} +{'loss': '0.4406', 'grad_norm': '1.565', 'learning_rate': '2.114e-05', 'epoch': '0.0423', 'num_input_tokens_seen': 3438960, 'train_runtime': '1740', 'train_tokens_per_second': '1976'} +{'loss': '1.191', 'grad_norm': '2.092', 'learning_rate': '2.115e-05', 'epoch': '0.04232', 'num_input_tokens_seen': 3441007, 'train_runtime': '1741', 'train_tokens_per_second': '1976'} +{'loss': '0.4214', 'grad_norm': '1.463', 'learning_rate': '2.116e-05', 'epoch': '0.04235', 'num_input_tokens_seen': 3443054, 'train_runtime': '1742', 'train_tokens_per_second': '1976'} +{'loss': '0.7299', 'grad_norm': '1.733', 'learning_rate': '2.117e-05', 'epoch': '0.04237', 'num_input_tokens_seen': 3445101, 'train_runtime': '1743', 'train_tokens_per_second': '1976'} +{'loss': '0.4932', 'grad_norm': '1.593', 'learning_rate': '2.119e-05', 'epoch': '0.0424', 'num_input_tokens_seen': 3447148, 'train_runtime': '1744', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '2.155', 'learning_rate': '2.12e-05', 'epoch': '0.04243', 'num_input_tokens_seen': 3449195, 'train_runtime': '1745', 'train_tokens_per_second': '1976'} +{'loss': '2.013', 'grad_norm': '3.006', 'learning_rate': '2.121e-05', 'epoch': '0.04245', 'num_input_tokens_seen': 3451242, 'train_runtime': '1746', 'train_tokens_per_second': '1976'} +{'loss': '1.933', 'grad_norm': '3.393', 'learning_rate': '2.122e-05', 'epoch': '0.04248', 'num_input_tokens_seen': 3453289, 'train_runtime': '1747', 'train_tokens_per_second': '1976'} +{'loss': '1.102', 'grad_norm': '2.404', 'learning_rate': '2.124e-05', 'epoch': '0.0425', 'num_input_tokens_seen': 3455336, 'train_runtime': '1748', 'train_tokens_per_second': '1976'} +{'loss': '0.543', 'grad_norm': '1.536', 'learning_rate': '2.125e-05', 'epoch': '0.04253', 'num_input_tokens_seen': 3457383, 'train_runtime': '1749', 'train_tokens_per_second': '1976'} +{'loss': '1.876', 'grad_norm': '3.142', 'learning_rate': '2.126e-05', 'epoch': '0.04255', 'num_input_tokens_seen': 3459430, 'train_runtime': '1750', 'train_tokens_per_second': '1976'} +{'loss': '1.842', 'grad_norm': '2.799', 'learning_rate': '2.127e-05', 'epoch': '0.04258', 'num_input_tokens_seen': 3461477, 'train_runtime': '1751', 'train_tokens_per_second': '1976'} +{'loss': '1.82', 'grad_norm': '3.257', 'learning_rate': '2.129e-05', 'epoch': '0.0426', 'num_input_tokens_seen': 3463524, 'train_runtime': '1752', 'train_tokens_per_second': '1976'} +{'loss': '1.263', 'grad_norm': '2.22', 'learning_rate': '2.13e-05', 'epoch': '0.04263', 'num_input_tokens_seen': 3465571, 'train_runtime': '1753', 'train_tokens_per_second': '1976'} +{'loss': '1.052', 'grad_norm': '2.061', 'learning_rate': '2.131e-05', 'epoch': '0.04265', 'num_input_tokens_seen': 3467618, 'train_runtime': '1755', 'train_tokens_per_second': '1976'} +{'loss': '0.5575', 'grad_norm': '1.754', 'learning_rate': '2.132e-05', 'epoch': '0.04268', 'num_input_tokens_seen': 3469665, 'train_runtime': '1756', 'train_tokens_per_second': '1976'} +{'loss': '0.8323', 'grad_norm': '1.946', 'learning_rate': '2.134e-05', 'epoch': '0.0427', 'num_input_tokens_seen': 3471712, 'train_runtime': '1757', 'train_tokens_per_second': '1976'} +{'loss': '1.173', 'grad_norm': '2.343', 'learning_rate': '2.135e-05', 'epoch': '0.04273', 'num_input_tokens_seen': 3473759, 'train_runtime': '1758', 'train_tokens_per_second': '1976'} +{'loss': '0.5939', 'grad_norm': '2.188', 'learning_rate': '2.136e-05', 'epoch': '0.04275', 'num_input_tokens_seen': 3475806, 'train_runtime': '1759', 'train_tokens_per_second': '1976'} +{'loss': '0.9592', 'grad_norm': '2.536', 'learning_rate': '2.137e-05', 'epoch': '0.04278', 'num_input_tokens_seen': 3477853, 'train_runtime': '1760', 'train_tokens_per_second': '1976'} +{'loss': '1.693', 'grad_norm': '2.828', 'learning_rate': '2.139e-05', 'epoch': '0.0428', 'num_input_tokens_seen': 3479900, 'train_runtime': '1761', 'train_tokens_per_second': '1976'} +{'loss': '2.463', 'grad_norm': '3.691', 'learning_rate': '2.14e-05', 'epoch': '0.04283', 'num_input_tokens_seen': 3481947, 'train_runtime': '1762', 'train_tokens_per_second': '1976'} +{'loss': '0.5249', 'grad_norm': '1.9', 'learning_rate': '2.141e-05', 'epoch': '0.04285', 'num_input_tokens_seen': 3483994, 'train_runtime': '1763', 'train_tokens_per_second': '1976'} +{'loss': '1.934', 'grad_norm': '2.888', 'learning_rate': '2.142e-05', 'epoch': '0.04288', 'num_input_tokens_seen': 3486041, 'train_runtime': '1764', 'train_tokens_per_second': '1976'} +{'loss': '0.8935', 'grad_norm': '2.231', 'learning_rate': '2.144e-05', 'epoch': '0.0429', 'num_input_tokens_seen': 3488088, 'train_runtime': '1765', 'train_tokens_per_second': '1976'} +{'loss': '0.8456', 'grad_norm': '2.161', 'learning_rate': '2.145e-05', 'epoch': '0.04293', 'num_input_tokens_seen': 3490135, 'train_runtime': '1766', 'train_tokens_per_second': '1976'} +{'loss': '1.124', 'grad_norm': '2.573', 'learning_rate': '2.146e-05', 'epoch': '0.04295', 'num_input_tokens_seen': 3492182, 'train_runtime': '1767', 'train_tokens_per_second': '1976'} +{'loss': '0.7668', 'grad_norm': '1.94', 'learning_rate': '2.148e-05', 'epoch': '0.04298', 'num_input_tokens_seen': 3494229, 'train_runtime': '1768', 'train_tokens_per_second': '1976'} +{'loss': '1.407', 'grad_norm': '3.501', 'learning_rate': '2.149e-05', 'epoch': '0.043', 'num_input_tokens_seen': 3496276, 'train_runtime': '1769', 'train_tokens_per_second': '1976'} +{'loss': '1.018', 'grad_norm': '2.056', 'learning_rate': '2.15e-05', 'epoch': '0.04303', 'num_input_tokens_seen': 3498323, 'train_runtime': '1770', 'train_tokens_per_second': '1976'} +{'loss': '0.8459', 'grad_norm': '2.35', 'learning_rate': '2.151e-05', 'epoch': '0.04305', 'num_input_tokens_seen': 3500370, 'train_runtime': '1771', 'train_tokens_per_second': '1976'} +{'loss': '0.9367', 'grad_norm': '2.369', 'learning_rate': '2.153e-05', 'epoch': '0.04308', 'num_input_tokens_seen': 3502417, 'train_runtime': '1772', 'train_tokens_per_second': '1976'} +{'loss': '0.7532', 'grad_norm': '1.735', 'learning_rate': '2.154e-05', 'epoch': '0.0431', 'num_input_tokens_seen': 3504464, 'train_runtime': '1773', 'train_tokens_per_second': '1976'} +{'loss': '0.5808', 'grad_norm': '1.596', 'learning_rate': '2.155e-05', 'epoch': '0.04313', 'num_input_tokens_seen': 3506511, 'train_runtime': '1774', 'train_tokens_per_second': '1976'} +{'loss': '1.511', 'grad_norm': '2.759', 'learning_rate': '2.156e-05', 'epoch': '0.04316', 'num_input_tokens_seen': 3508558, 'train_runtime': '1775', 'train_tokens_per_second': '1976'} +{'loss': '1.115', 'grad_norm': '2.155', 'learning_rate': '2.158e-05', 'epoch': '0.04318', 'num_input_tokens_seen': 3510605, 'train_runtime': '1776', 'train_tokens_per_second': '1976'} +{'loss': '0.9662', 'grad_norm': '2.05', 'learning_rate': '2.159e-05', 'epoch': '0.04321', 'num_input_tokens_seen': 3512652, 'train_runtime': '1777', 'train_tokens_per_second': '1976'} +{'loss': '3.263', 'grad_norm': '2.912', 'learning_rate': '2.16e-05', 'epoch': '0.04323', 'num_input_tokens_seen': 3514699, 'train_runtime': '1778', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '2.156', 'learning_rate': '2.161e-05', 'epoch': '0.04326', 'num_input_tokens_seen': 3516746, 'train_runtime': '1779', 'train_tokens_per_second': '1976'} +{'loss': '1.637', 'grad_norm': '2.799', 'learning_rate': '2.163e-05', 'epoch': '0.04328', 'num_input_tokens_seen': 3518793, 'train_runtime': '1780', 'train_tokens_per_second': '1976'} +{'loss': '2.384', 'grad_norm': '3.737', 'learning_rate': '2.164e-05', 'epoch': '0.04331', 'num_input_tokens_seen': 3520840, 'train_runtime': '1781', 'train_tokens_per_second': '1976'} +{'loss': '0.9507', 'grad_norm': '2.796', 'learning_rate': '2.165e-05', 'epoch': '0.04333', 'num_input_tokens_seen': 3522887, 'train_runtime': '1782', 'train_tokens_per_second': '1976'} +{'loss': '0.9913', 'grad_norm': '2.826', 'learning_rate': '2.166e-05', 'epoch': '0.04336', 'num_input_tokens_seen': 3524934, 'train_runtime': '1783', 'train_tokens_per_second': '1976'} +{'loss': '1.885', 'grad_norm': '2.727', 'learning_rate': '2.168e-05', 'epoch': '0.04338', 'num_input_tokens_seen': 3526981, 'train_runtime': '1784', 'train_tokens_per_second': '1976'} +{'loss': '0.4177', 'grad_norm': '1.507', 'learning_rate': '2.169e-05', 'epoch': '0.04341', 'num_input_tokens_seen': 3529028, 'train_runtime': '1786', 'train_tokens_per_second': '1976'} +{'loss': '1.424', 'grad_norm': '2.43', 'learning_rate': '2.17e-05', 'epoch': '0.04343', 'num_input_tokens_seen': 3531075, 'train_runtime': '1787', 'train_tokens_per_second': '1976'} +{'loss': '1.447', 'grad_norm': '2.716', 'learning_rate': '2.171e-05', 'epoch': '0.04346', 'num_input_tokens_seen': 3533122, 'train_runtime': '1788', 'train_tokens_per_second': '1976'} +{'loss': '1.43', 'grad_norm': '2.533', 'learning_rate': '2.173e-05', 'epoch': '0.04348', 'num_input_tokens_seen': 3535169, 'train_runtime': '1789', 'train_tokens_per_second': '1976'} +{'loss': '2.08', 'grad_norm': '3.264', 'learning_rate': '2.174e-05', 'epoch': '0.04351', 'num_input_tokens_seen': 3537216, 'train_runtime': '1790', 'train_tokens_per_second': '1976'} +{'loss': '0.816', 'grad_norm': '2.028', 'learning_rate': '2.175e-05', 'epoch': '0.04353', 'num_input_tokens_seen': 3539263, 'train_runtime': '1791', 'train_tokens_per_second': '1976'} +{'loss': '0.521', 'grad_norm': '1.764', 'learning_rate': '2.176e-05', 'epoch': '0.04356', 'num_input_tokens_seen': 3541310, 'train_runtime': '1792', 'train_tokens_per_second': '1976'} +{'loss': '2.649', 'grad_norm': '3.514', 'learning_rate': '2.178e-05', 'epoch': '0.04358', 'num_input_tokens_seen': 3543357, 'train_runtime': '1793', 'train_tokens_per_second': '1976'} +{'loss': '2.659', 'grad_norm': '3.269', 'learning_rate': '2.179e-05', 'epoch': '0.04361', 'num_input_tokens_seen': 3545404, 'train_runtime': '1794', 'train_tokens_per_second': '1976'} +{'loss': '0.7882', 'grad_norm': '2.299', 'learning_rate': '2.18e-05', 'epoch': '0.04363', 'num_input_tokens_seen': 3547451, 'train_runtime': '1795', 'train_tokens_per_second': '1976'} +{'loss': '0.6568', 'grad_norm': '1.896', 'learning_rate': '2.182e-05', 'epoch': '0.04366', 'num_input_tokens_seen': 3549498, 'train_runtime': '1796', 'train_tokens_per_second': '1976'} +{'loss': '1.16', 'grad_norm': '2.657', 'learning_rate': '2.183e-05', 'epoch': '0.04368', 'num_input_tokens_seen': 3551545, 'train_runtime': '1797', 'train_tokens_per_second': '1976'} +{'loss': '2.019', 'grad_norm': '3.387', 'learning_rate': '2.184e-05', 'epoch': '0.04371', 'num_input_tokens_seen': 3553592, 'train_runtime': '1798', 'train_tokens_per_second': '1976'} +{'loss': '0.7375', 'grad_norm': '1.763', 'learning_rate': '2.185e-05', 'epoch': '0.04373', 'num_input_tokens_seen': 3555639, 'train_runtime': '1799', 'train_tokens_per_second': '1976'} +{'loss': '1.061', 'grad_norm': '2.201', 'learning_rate': '2.187e-05', 'epoch': '0.04376', 'num_input_tokens_seen': 3557686, 'train_runtime': '1800', 'train_tokens_per_second': '1976'} +{'loss': '1.107', 'grad_norm': '2.104', 'learning_rate': '2.188e-05', 'epoch': '0.04378', 'num_input_tokens_seen': 3559733, 'train_runtime': '1801', 'train_tokens_per_second': '1976'} +{'loss': '0.8443', 'grad_norm': '1.993', 'learning_rate': '2.189e-05', 'epoch': '0.04381', 'num_input_tokens_seen': 3561780, 'train_runtime': '1802', 'train_tokens_per_second': '1976'} +{'loss': '1.304', 'grad_norm': '2.58', 'learning_rate': '2.19e-05', 'epoch': '0.04384', 'num_input_tokens_seen': 3563827, 'train_runtime': '1803', 'train_tokens_per_second': '1976'} +{'loss': '1.948', 'grad_norm': '3.013', 'learning_rate': '2.192e-05', 'epoch': '0.04386', 'num_input_tokens_seen': 3565874, 'train_runtime': '1804', 'train_tokens_per_second': '1976'} +{'loss': '0.458', 'grad_norm': '1.551', 'learning_rate': '2.193e-05', 'epoch': '0.04389', 'num_input_tokens_seen': 3567921, 'train_runtime': '1805', 'train_tokens_per_second': '1976'} +{'loss': '1.914', 'grad_norm': '3.048', 'learning_rate': '2.194e-05', 'epoch': '0.04391', 'num_input_tokens_seen': 3569968, 'train_runtime': '1806', 'train_tokens_per_second': '1976'} +{'loss': '1.296', 'grad_norm': '2.315', 'learning_rate': '2.195e-05', 'epoch': '0.04394', 'num_input_tokens_seen': 3572015, 'train_runtime': '1807', 'train_tokens_per_second': '1976'} +{'loss': '1.085', 'grad_norm': '2.58', 'learning_rate': '2.197e-05', 'epoch': '0.04396', 'num_input_tokens_seen': 3574062, 'train_runtime': '1808', 'train_tokens_per_second': '1976'} +{'loss': '0.6698', 'grad_norm': '1.621', 'learning_rate': '2.198e-05', 'epoch': '0.04399', 'num_input_tokens_seen': 3576109, 'train_runtime': '1809', 'train_tokens_per_second': '1976'} +{'loss': '1.531', 'grad_norm': '2.011', 'learning_rate': '2.199e-05', 'epoch': '0.04401', 'num_input_tokens_seen': 3578156, 'train_runtime': '1810', 'train_tokens_per_second': '1977'} +{'loss': '0.8321', 'grad_norm': '1.648', 'learning_rate': '2.2e-05', 'epoch': '0.04404', 'num_input_tokens_seen': 3580203, 'train_runtime': '1811', 'train_tokens_per_second': '1977'} +{'loss': '0.8263', 'grad_norm': '1.826', 'learning_rate': '2.202e-05', 'epoch': '0.04406', 'num_input_tokens_seen': 3582250, 'train_runtime': '1812', 'train_tokens_per_second': '1977'} +{'loss': '0.8032', 'grad_norm': '2.091', 'learning_rate': '2.203e-05', 'epoch': '0.04409', 'num_input_tokens_seen': 3584297, 'train_runtime': '1813', 'train_tokens_per_second': '1977'} +{'loss': '1.826', 'grad_norm': '2.641', 'learning_rate': '2.204e-05', 'epoch': '0.04411', 'num_input_tokens_seen': 3586344, 'train_runtime': '1814', 'train_tokens_per_second': '1977'} +{'loss': '0.7835', 'grad_norm': '1.903', 'learning_rate': '2.205e-05', 'epoch': '0.04414', 'num_input_tokens_seen': 3588391, 'train_runtime': '1816', 'train_tokens_per_second': '1977'} +{'loss': '0.4428', 'grad_norm': '1.256', 'learning_rate': '2.207e-05', 'epoch': '0.04416', 'num_input_tokens_seen': 3590438, 'train_runtime': '1817', 'train_tokens_per_second': '1977'} +{'loss': '1.143', 'grad_norm': '2.086', 'learning_rate': '2.208e-05', 'epoch': '0.04419', 'num_input_tokens_seen': 3592485, 'train_runtime': '1818', 'train_tokens_per_second': '1977'} +{'loss': '0.4302', 'grad_norm': '1.48', 'learning_rate': '2.209e-05', 'epoch': '0.04421', 'num_input_tokens_seen': 3594532, 'train_runtime': '1819', 'train_tokens_per_second': '1977'} +{'loss': '0.9056', 'grad_norm': '2.087', 'learning_rate': '2.21e-05', 'epoch': '0.04424', 'num_input_tokens_seen': 3596579, 'train_runtime': '1820', 'train_tokens_per_second': '1977'} +{'loss': '1.849', 'grad_norm': '4.042', 'learning_rate': '2.212e-05', 'epoch': '0.04426', 'num_input_tokens_seen': 3598626, 'train_runtime': '1821', 'train_tokens_per_second': '1977'} +{'loss': '0.9101', 'grad_norm': '2.557', 'learning_rate': '2.213e-05', 'epoch': '0.04429', 'num_input_tokens_seen': 3600673, 'train_runtime': '1822', 'train_tokens_per_second': '1977'} +{'loss': '0.6041', 'grad_norm': '1.926', 'learning_rate': '2.214e-05', 'epoch': '0.04431', 'num_input_tokens_seen': 3602720, 'train_runtime': '1823', 'train_tokens_per_second': '1977'} +{'loss': '1.439', 'grad_norm': '2.871', 'learning_rate': '2.216e-05', 'epoch': '0.04434', 'num_input_tokens_seen': 3604767, 'train_runtime': '1824', 'train_tokens_per_second': '1977'} +{'loss': '0.709', 'grad_norm': '2.188', 'learning_rate': '2.217e-05', 'epoch': '0.04436', 'num_input_tokens_seen': 3606814, 'train_runtime': '1825', 'train_tokens_per_second': '1977'} +{'loss': '0.5694', 'grad_norm': '1.909', 'learning_rate': '2.218e-05', 'epoch': '0.04439', 'num_input_tokens_seen': 3608861, 'train_runtime': '1826', 'train_tokens_per_second': '1977'} +{'loss': '0.4914', 'grad_norm': '1.561', 'learning_rate': '2.219e-05', 'epoch': '0.04441', 'num_input_tokens_seen': 3610908, 'train_runtime': '1827', 'train_tokens_per_second': '1977'} +{'loss': '0.6992', 'grad_norm': '2.229', 'learning_rate': '2.221e-05', 'epoch': '0.04444', 'num_input_tokens_seen': 3612955, 'train_runtime': '1828', 'train_tokens_per_second': '1977'} +{'loss': '1.546', 'grad_norm': '2.764', 'learning_rate': '2.222e-05', 'epoch': '0.04446', 'num_input_tokens_seen': 3615002, 'train_runtime': '1829', 'train_tokens_per_second': '1977'} +{'loss': '0.6145', 'grad_norm': '1.903', 'learning_rate': '2.223e-05', 'epoch': '0.04449', 'num_input_tokens_seen': 3617049, 'train_runtime': '1830', 'train_tokens_per_second': '1977'} +{'loss': '0.4939', 'grad_norm': '1.323', 'learning_rate': '2.224e-05', 'epoch': '0.04451', 'num_input_tokens_seen': 3619096, 'train_runtime': '1831', 'train_tokens_per_second': '1977'} +{'loss': '0.8292', 'grad_norm': '2.003', 'learning_rate': '2.226e-05', 'epoch': '0.04454', 'num_input_tokens_seen': 3621143, 'train_runtime': '1832', 'train_tokens_per_second': '1977'} +{'loss': '1.421', 'grad_norm': '2.463', 'learning_rate': '2.227e-05', 'epoch': '0.04457', 'num_input_tokens_seen': 3623190, 'train_runtime': '1833', 'train_tokens_per_second': '1977'} +{'loss': '2.798', 'grad_norm': '3.381', 'learning_rate': '2.228e-05', 'epoch': '0.04459', 'num_input_tokens_seen': 3625237, 'train_runtime': '1834', 'train_tokens_per_second': '1977'} +{'loss': '0.9919', 'grad_norm': '2.049', 'learning_rate': '2.229e-05', 'epoch': '0.04462', 'num_input_tokens_seen': 3627284, 'train_runtime': '1835', 'train_tokens_per_second': '1977'} +{'loss': '1.849', 'grad_norm': '3.19', 'learning_rate': '2.231e-05', 'epoch': '0.04464', 'num_input_tokens_seen': 3629331, 'train_runtime': '1836', 'train_tokens_per_second': '1977'} +{'loss': '0.8417', 'grad_norm': '1.944', 'learning_rate': '2.232e-05', 'epoch': '0.04467', 'num_input_tokens_seen': 3631378, 'train_runtime': '1837', 'train_tokens_per_second': '1977'} +{'loss': '0.9082', 'grad_norm': '2.108', 'learning_rate': '2.233e-05', 'epoch': '0.04469', 'num_input_tokens_seen': 3633425, 'train_runtime': '1838', 'train_tokens_per_second': '1977'} +{'loss': '1.22', 'grad_norm': '2.596', 'learning_rate': '2.234e-05', 'epoch': '0.04472', 'num_input_tokens_seen': 3635472, 'train_runtime': '1839', 'train_tokens_per_second': '1977'} +{'loss': '0.9201', 'grad_norm': '2.049', 'learning_rate': '2.236e-05', 'epoch': '0.04474', 'num_input_tokens_seen': 3637519, 'train_runtime': '1840', 'train_tokens_per_second': '1977'} +{'loss': '0.3973', 'grad_norm': '1.486', 'learning_rate': '2.237e-05', 'epoch': '0.04477', 'num_input_tokens_seen': 3639566, 'train_runtime': '1841', 'train_tokens_per_second': '1977'} +{'loss': '1.907', 'grad_norm': '2.986', 'learning_rate': '2.238e-05', 'epoch': '0.04479', 'num_input_tokens_seen': 3641613, 'train_runtime': '1842', 'train_tokens_per_second': '1977'} +{'loss': '1.459', 'grad_norm': '3.047', 'learning_rate': '2.239e-05', 'epoch': '0.04482', 'num_input_tokens_seen': 3643660, 'train_runtime': '1843', 'train_tokens_per_second': '1977'} +{'loss': '0.6411', 'grad_norm': '1.807', 'learning_rate': '2.241e-05', 'epoch': '0.04484', 'num_input_tokens_seen': 3645707, 'train_runtime': '1844', 'train_tokens_per_second': '1977'} +{'loss': '1.433', 'grad_norm': '3.022', 'learning_rate': '2.242e-05', 'epoch': '0.04487', 'num_input_tokens_seen': 3647754, 'train_runtime': '1846', 'train_tokens_per_second': '1977'} +{'loss': '1.909', 'grad_norm': '3.105', 'learning_rate': '2.243e-05', 'epoch': '0.04489', 'num_input_tokens_seen': 3649801, 'train_runtime': '1847', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '2.465', 'learning_rate': '2.244e-05', 'epoch': '0.04492', 'num_input_tokens_seen': 3651848, 'train_runtime': '1848', 'train_tokens_per_second': '1977'} +{'loss': '0.4589', 'grad_norm': '1.913', 'learning_rate': '2.246e-05', 'epoch': '0.04494', 'num_input_tokens_seen': 3653895, 'train_runtime': '1849', 'train_tokens_per_second': '1977'} +{'loss': '1.915', 'grad_norm': '2.635', 'learning_rate': '2.247e-05', 'epoch': '0.04497', 'num_input_tokens_seen': 3655942, 'train_runtime': '1850', 'train_tokens_per_second': '1977'} +{'loss': '0.6084', 'grad_norm': '2.148', 'learning_rate': '2.248e-05', 'epoch': '0.04499', 'num_input_tokens_seen': 3657989, 'train_runtime': '1851', 'train_tokens_per_second': '1977'} +{'loss': '1.371', 'grad_norm': '2.892', 'learning_rate': '2.249e-05', 'epoch': '0.04502', 'num_input_tokens_seen': 3660036, 'train_runtime': '1852', 'train_tokens_per_second': '1977'} +{'loss': '0.486', 'grad_norm': '1.439', 'learning_rate': '2.251e-05', 'epoch': '0.04504', 'num_input_tokens_seen': 3662083, 'train_runtime': '1853', 'train_tokens_per_second': '1977'} +{'loss': '0.6078', 'grad_norm': '1.883', 'learning_rate': '2.252e-05', 'epoch': '0.04507', 'num_input_tokens_seen': 3664130, 'train_runtime': '1854', 'train_tokens_per_second': '1977'} +{'loss': '1.867', 'grad_norm': '2.935', 'learning_rate': '2.253e-05', 'epoch': '0.04509', 'num_input_tokens_seen': 3666177, 'train_runtime': '1855', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '2.214', 'learning_rate': '2.255e-05', 'epoch': '0.04512', 'num_input_tokens_seen': 3668224, 'train_runtime': '1856', 'train_tokens_per_second': '1977'} +{'loss': '1.021', 'grad_norm': '2.295', 'learning_rate': '2.256e-05', 'epoch': '0.04514', 'num_input_tokens_seen': 3670271, 'train_runtime': '1857', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '2.571', 'learning_rate': '2.257e-05', 'epoch': '0.04517', 'num_input_tokens_seen': 3672318, 'train_runtime': '1858', 'train_tokens_per_second': '1977'} +{'loss': '0.3846', 'grad_norm': '1.486', 'learning_rate': '2.258e-05', 'epoch': '0.04519', 'num_input_tokens_seen': 3674365, 'train_runtime': '1859', 'train_tokens_per_second': '1977'} +{'loss': '1.332', 'grad_norm': '2.707', 'learning_rate': '2.26e-05', 'epoch': '0.04522', 'num_input_tokens_seen': 3676412, 'train_runtime': '1860', 'train_tokens_per_second': '1977'} +{'loss': '2.463', 'grad_norm': '3.731', 'learning_rate': '2.261e-05', 'epoch': '0.04525', 'num_input_tokens_seen': 3678459, 'train_runtime': '1861', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '3.189', 'learning_rate': '2.262e-05', 'epoch': '0.04527', 'num_input_tokens_seen': 3680506, 'train_runtime': '1862', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '2.722', 'learning_rate': '2.263e-05', 'epoch': '0.0453', 'num_input_tokens_seen': 3682553, 'train_runtime': '1863', 'train_tokens_per_second': '1977'} +{'loss': '0.8351', 'grad_norm': '3.121', 'learning_rate': '2.265e-05', 'epoch': '0.04532', 'num_input_tokens_seen': 3684600, 'train_runtime': '1864', 'train_tokens_per_second': '1977'} +{'loss': '0.5029', 'grad_norm': '1.548', 'learning_rate': '2.266e-05', 'epoch': '0.04535', 'num_input_tokens_seen': 3686647, 'train_runtime': '1865', 'train_tokens_per_second': '1977'} +{'loss': '1.061', 'grad_norm': '2.714', 'learning_rate': '2.267e-05', 'epoch': '0.04537', 'num_input_tokens_seen': 3688694, 'train_runtime': '1866', 'train_tokens_per_second': '1977'} +{'loss': '0.4882', 'grad_norm': '1.672', 'learning_rate': '2.268e-05', 'epoch': '0.0454', 'num_input_tokens_seen': 3690741, 'train_runtime': '1867', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '2.195', 'learning_rate': '2.27e-05', 'epoch': '0.04542', 'num_input_tokens_seen': 3692788, 'train_runtime': '1868', 'train_tokens_per_second': '1977'} +{'loss': '2.739', 'grad_norm': '3.286', 'learning_rate': '2.271e-05', 'epoch': '0.04545', 'num_input_tokens_seen': 3694835, 'train_runtime': '1869', 'train_tokens_per_second': '1977'} +{'loss': '1.596', 'grad_norm': '2.584', 'learning_rate': '2.272e-05', 'epoch': '0.04547', 'num_input_tokens_seen': 3696882, 'train_runtime': '1870', 'train_tokens_per_second': '1977'} +{'loss': '0.4675', 'grad_norm': '1.908', 'learning_rate': '2.273e-05', 'epoch': '0.0455', 'num_input_tokens_seen': 3698929, 'train_runtime': '1871', 'train_tokens_per_second': '1977'} +{'loss': '0.4531', 'grad_norm': '1.746', 'learning_rate': '2.275e-05', 'epoch': '0.04552', 'num_input_tokens_seen': 3700976, 'train_runtime': '1872', 'train_tokens_per_second': '1977'} +{'loss': '1.301', 'grad_norm': '2.524', 'learning_rate': '2.276e-05', 'epoch': '0.04555', 'num_input_tokens_seen': 3703023, 'train_runtime': '1873', 'train_tokens_per_second': '1977'} +{'loss': '0.5679', 'grad_norm': '1.679', 'learning_rate': '2.277e-05', 'epoch': '0.04557', 'num_input_tokens_seen': 3705070, 'train_runtime': '1874', 'train_tokens_per_second': '1977'} +{'loss': '0.9499', 'grad_norm': '2.082', 'learning_rate': '2.278e-05', 'epoch': '0.0456', 'num_input_tokens_seen': 3707117, 'train_runtime': '1876', 'train_tokens_per_second': '1977'} +{'loss': '1.504', 'grad_norm': '2.86', 'learning_rate': '2.28e-05', 'epoch': '0.04562', 'num_input_tokens_seen': 3709164, 'train_runtime': '1877', 'train_tokens_per_second': '1977'} +{'loss': '0.8234', 'grad_norm': '2.351', 'learning_rate': '2.281e-05', 'epoch': '0.04565', 'num_input_tokens_seen': 3711211, 'train_runtime': '1878', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '2.631', 'learning_rate': '2.282e-05', 'epoch': '0.04567', 'num_input_tokens_seen': 3713258, 'train_runtime': '1879', 'train_tokens_per_second': '1977'} +{'loss': '1.581', 'grad_norm': '2.438', 'learning_rate': '2.283e-05', 'epoch': '0.0457', 'num_input_tokens_seen': 3715305, 'train_runtime': '1880', 'train_tokens_per_second': '1977'} +{'loss': '0.3994', 'grad_norm': '1.707', 'learning_rate': '2.285e-05', 'epoch': '0.04572', 'num_input_tokens_seen': 3717352, 'train_runtime': '1881', 'train_tokens_per_second': '1977'} +{'loss': '0.9012', 'grad_norm': '2.115', 'learning_rate': '2.286e-05', 'epoch': '0.04575', 'num_input_tokens_seen': 3719399, 'train_runtime': '1882', 'train_tokens_per_second': '1977'} +{'loss': '0.8092', 'grad_norm': '2.34', 'learning_rate': '2.287e-05', 'epoch': '0.04577', 'num_input_tokens_seen': 3721446, 'train_runtime': '1883', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '2.3', 'learning_rate': '2.289e-05', 'epoch': '0.0458', 'num_input_tokens_seen': 3723493, 'train_runtime': '1884', 'train_tokens_per_second': '1977'} +{'loss': '1.04', 'grad_norm': '2.259', 'learning_rate': '2.29e-05', 'epoch': '0.04582', 'num_input_tokens_seen': 3725540, 'train_runtime': '1885', 'train_tokens_per_second': '1977'} +{'loss': '1.51', 'grad_norm': '3.255', 'learning_rate': '2.291e-05', 'epoch': '0.04585', 'num_input_tokens_seen': 3727587, 'train_runtime': '1886', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '2.572', 'learning_rate': '2.292e-05', 'epoch': '0.04587', 'num_input_tokens_seen': 3729634, 'train_runtime': '1887', 'train_tokens_per_second': '1977'} +{'loss': '0.5099', 'grad_norm': '1.944', 'learning_rate': '2.294e-05', 'epoch': '0.0459', 'num_input_tokens_seen': 3731681, 'train_runtime': '1888', 'train_tokens_per_second': '1977'} +{'loss': '0.7085', 'grad_norm': '1.653', 'learning_rate': '2.295e-05', 'epoch': '0.04592', 'num_input_tokens_seen': 3733728, 'train_runtime': '1889', 'train_tokens_per_second': '1977'} +{'loss': '0.9076', 'grad_norm': '2.17', 'learning_rate': '2.296e-05', 'epoch': '0.04595', 'num_input_tokens_seen': 3735775, 'train_runtime': '1890', 'train_tokens_per_second': '1977'} +{'loss': '1.592', 'grad_norm': '2.844', 'learning_rate': '2.297e-05', 'epoch': '0.04598', 'num_input_tokens_seen': 3737822, 'train_runtime': '1891', 'train_tokens_per_second': '1977'} +{'loss': '1.214', 'grad_norm': '2.974', 'learning_rate': '2.299e-05', 'epoch': '0.046', 'num_input_tokens_seen': 3739869, 'train_runtime': '1892', 'train_tokens_per_second': '1977'} +{'loss': '1.482', 'grad_norm': '2.977', 'learning_rate': '2.3e-05', 'epoch': '0.04603', 'num_input_tokens_seen': 3741916, 'train_runtime': '1893', 'train_tokens_per_second': '1977'} +{'loss': '2.395', 'grad_norm': '3.314', 'learning_rate': '2.301e-05', 'epoch': '0.04605', 'num_input_tokens_seen': 3743963, 'train_runtime': '1894', 'train_tokens_per_second': '1977'} +{'loss': '0.3476', 'grad_norm': '1.435', 'learning_rate': '2.302e-05', 'epoch': '0.04608', 'num_input_tokens_seen': 3746010, 'train_runtime': '1895', 'train_tokens_per_second': '1977'} +{'loss': '1.786', 'grad_norm': '3.612', 'learning_rate': '2.304e-05', 'epoch': '0.0461', 'num_input_tokens_seen': 3748057, 'train_runtime': '1896', 'train_tokens_per_second': '1977'} +{'loss': '0.5982', 'grad_norm': '1.957', 'learning_rate': '2.305e-05', 'epoch': '0.04613', 'num_input_tokens_seen': 3750104, 'train_runtime': '1897', 'train_tokens_per_second': '1977'} +{'loss': '0.6558', 'grad_norm': '2.231', 'learning_rate': '2.306e-05', 'epoch': '0.04615', 'num_input_tokens_seen': 3752151, 'train_runtime': '1898', 'train_tokens_per_second': '1977'} +{'loss': '1.729', 'grad_norm': '3.188', 'learning_rate': '2.307e-05', 'epoch': '0.04618', 'num_input_tokens_seen': 3754198, 'train_runtime': '1899', 'train_tokens_per_second': '1977'} +{'loss': '0.8835', 'grad_norm': '2.731', 'learning_rate': '2.309e-05', 'epoch': '0.0462', 'num_input_tokens_seen': 3756245, 'train_runtime': '1900', 'train_tokens_per_second': '1977'} +{'loss': '0.3683', 'grad_norm': '1.54', 'learning_rate': '2.31e-05', 'epoch': '0.04623', 'num_input_tokens_seen': 3758292, 'train_runtime': '1901', 'train_tokens_per_second': '1977'} +{'loss': '0.4424', 'grad_norm': '1.707', 'learning_rate': '2.311e-05', 'epoch': '0.04625', 'num_input_tokens_seen': 3760339, 'train_runtime': '1902', 'train_tokens_per_second': '1977'} +{'loss': '0.4575', 'grad_norm': '1.744', 'learning_rate': '2.312e-05', 'epoch': '0.04628', 'num_input_tokens_seen': 3762386, 'train_runtime': '1903', 'train_tokens_per_second': '1977'} +{'loss': '1.521', 'grad_norm': '2.584', 'learning_rate': '2.314e-05', 'epoch': '0.0463', 'num_input_tokens_seen': 3764433, 'train_runtime': '1904', 'train_tokens_per_second': '1977'} +{'loss': '0.9475', 'grad_norm': '2.273', 'learning_rate': '2.315e-05', 'epoch': '0.04633', 'num_input_tokens_seen': 3766480, 'train_runtime': '1906', 'train_tokens_per_second': '1977'} +{'loss': '0.7326', 'grad_norm': '2.001', 'learning_rate': '2.316e-05', 'epoch': '0.04635', 'num_input_tokens_seen': 3768527, 'train_runtime': '1907', 'train_tokens_per_second': '1977'} +{'loss': '2.275', 'grad_norm': '3.395', 'learning_rate': '2.317e-05', 'epoch': '0.04638', 'num_input_tokens_seen': 3770574, 'train_runtime': '1908', 'train_tokens_per_second': '1977'} +{'loss': '1.38', 'grad_norm': '2.628', 'learning_rate': '2.319e-05', 'epoch': '0.0464', 'num_input_tokens_seen': 3772621, 'train_runtime': '1909', 'train_tokens_per_second': '1977'} +{'loss': '0.8242', 'grad_norm': '2.939', 'learning_rate': '2.32e-05', 'epoch': '0.04643', 'num_input_tokens_seen': 3774668, 'train_runtime': '1910', 'train_tokens_per_second': '1977'} +{'loss': '0.9423', 'grad_norm': '2.269', 'learning_rate': '2.321e-05', 'epoch': '0.04645', 'num_input_tokens_seen': 3776715, 'train_runtime': '1911', 'train_tokens_per_second': '1977'} +{'loss': '1.27', 'grad_norm': '2.679', 'learning_rate': '2.323e-05', 'epoch': '0.04648', 'num_input_tokens_seen': 3778762, 'train_runtime': '1912', 'train_tokens_per_second': '1977'} +{'loss': '0.501', 'grad_norm': '1.966', 'learning_rate': '2.324e-05', 'epoch': '0.0465', 'num_input_tokens_seen': 3780809, 'train_runtime': '1913', 'train_tokens_per_second': '1977'} +{'loss': '0.4555', 'grad_norm': '1.743', 'learning_rate': '2.325e-05', 'epoch': '0.04653', 'num_input_tokens_seen': 3782856, 'train_runtime': '1914', 'train_tokens_per_second': '1977'} +{'loss': '1.205', 'grad_norm': '2.043', 'learning_rate': '2.326e-05', 'epoch': '0.04655', 'num_input_tokens_seen': 3784903, 'train_runtime': '1915', 'train_tokens_per_second': '1977'} +{'loss': '1.155', 'grad_norm': '2.33', 'learning_rate': '2.328e-05', 'epoch': '0.04658', 'num_input_tokens_seen': 3786950, 'train_runtime': '1916', 'train_tokens_per_second': '1977'} +{'loss': '0.7369', 'grad_norm': '2.16', 'learning_rate': '2.329e-05', 'epoch': '0.0466', 'num_input_tokens_seen': 3788997, 'train_runtime': '1917', 'train_tokens_per_second': '1977'} +{'loss': '1.173', 'grad_norm': '3.286', 'learning_rate': '2.33e-05', 'epoch': '0.04663', 'num_input_tokens_seen': 3791044, 'train_runtime': '1918', 'train_tokens_per_second': '1977'} +{'loss': '0.9294', 'grad_norm': '2.56', 'learning_rate': '2.331e-05', 'epoch': '0.04666', 'num_input_tokens_seen': 3793091, 'train_runtime': '1919', 'train_tokens_per_second': '1977'} +{'loss': '0.7232', 'grad_norm': '1.816', 'learning_rate': '2.333e-05', 'epoch': '0.04668', 'num_input_tokens_seen': 3795138, 'train_runtime': '1920', 'train_tokens_per_second': '1977'} +{'loss': '1.83', 'grad_norm': '2.9', 'learning_rate': '2.334e-05', 'epoch': '0.04671', 'num_input_tokens_seen': 3797185, 'train_runtime': '1921', 'train_tokens_per_second': '1977'} +{'loss': '1.103', 'grad_norm': '1.921', 'learning_rate': '2.335e-05', 'epoch': '0.04673', 'num_input_tokens_seen': 3799232, 'train_runtime': '1922', 'train_tokens_per_second': '1977'} +{'loss': '0.7467', 'grad_norm': '2.333', 'learning_rate': '2.336e-05', 'epoch': '0.04676', 'num_input_tokens_seen': 3801279, 'train_runtime': '1923', 'train_tokens_per_second': '1977'} +{'loss': '1.051', 'grad_norm': '2.482', 'learning_rate': '2.338e-05', 'epoch': '0.04678', 'num_input_tokens_seen': 3803326, 'train_runtime': '1924', 'train_tokens_per_second': '1977'} +{'loss': '0.8582', 'grad_norm': '2.045', 'learning_rate': '2.339e-05', 'epoch': '0.04681', 'num_input_tokens_seen': 3805373, 'train_runtime': '1925', 'train_tokens_per_second': '1977'} +{'loss': '1.889', 'grad_norm': '3.173', 'learning_rate': '2.34e-05', 'epoch': '0.04683', 'num_input_tokens_seen': 3807420, 'train_runtime': '1926', 'train_tokens_per_second': '1977'} +{'loss': '0.4914', 'grad_norm': '1.741', 'learning_rate': '2.341e-05', 'epoch': '0.04686', 'num_input_tokens_seen': 3809467, 'train_runtime': '1927', 'train_tokens_per_second': '1977'} +{'loss': '1.375', 'grad_norm': '2.603', 'learning_rate': '2.343e-05', 'epoch': '0.04688', 'num_input_tokens_seen': 3811514, 'train_runtime': '1928', 'train_tokens_per_second': '1977'} +{'loss': '1.466', 'grad_norm': '2.755', 'learning_rate': '2.344e-05', 'epoch': '0.04691', 'num_input_tokens_seen': 3813561, 'train_runtime': '1929', 'train_tokens_per_second': '1977'} +{'loss': '1.733', 'grad_norm': '2.534', 'learning_rate': '2.345e-05', 'epoch': '0.04693', 'num_input_tokens_seen': 3815608, 'train_runtime': '1930', 'train_tokens_per_second': '1977'} +{'loss': '1.528', 'grad_norm': '2.853', 'learning_rate': '2.346e-05', 'epoch': '0.04696', 'num_input_tokens_seen': 3817655, 'train_runtime': '1931', 'train_tokens_per_second': '1977'} +{'loss': '0.6119', 'grad_norm': '1.712', 'learning_rate': '2.348e-05', 'epoch': '0.04698', 'num_input_tokens_seen': 3819702, 'train_runtime': '1932', 'train_tokens_per_second': '1977'} +{'loss': '0.5824', 'grad_norm': '1.742', 'learning_rate': '2.349e-05', 'epoch': '0.04701', 'num_input_tokens_seen': 3821749, 'train_runtime': '1933', 'train_tokens_per_second': '1977'} +{'loss': '0.7602', 'grad_norm': '3.688', 'learning_rate': '2.35e-05', 'epoch': '0.04703', 'num_input_tokens_seen': 3823796, 'train_runtime': '1935', 'train_tokens_per_second': '1977'} +{'loss': '0.8211', 'grad_norm': '2.1', 'learning_rate': '2.351e-05', 'epoch': '0.04706', 'num_input_tokens_seen': 3825843, 'train_runtime': '1936', 'train_tokens_per_second': '1977'} +{'loss': '0.7078', 'grad_norm': '1.955', 'learning_rate': '2.353e-05', 'epoch': '0.04708', 'num_input_tokens_seen': 3827890, 'train_runtime': '1937', 'train_tokens_per_second': '1977'} +{'loss': '0.5427', 'grad_norm': '1.701', 'learning_rate': '2.354e-05', 'epoch': '0.04711', 'num_input_tokens_seen': 3829937, 'train_runtime': '1938', 'train_tokens_per_second': '1977'} +{'loss': '0.4518', 'grad_norm': '1.957', 'learning_rate': '2.355e-05', 'epoch': '0.04713', 'num_input_tokens_seen': 3831984, 'train_runtime': '1939', 'train_tokens_per_second': '1977'} +{'loss': '0.9242', 'grad_norm': '2.517', 'learning_rate': '2.356e-05', 'epoch': '0.04716', 'num_input_tokens_seen': 3834031, 'train_runtime': '1940', 'train_tokens_per_second': '1977'} +{'loss': '0.6243', 'grad_norm': '1.865', 'learning_rate': '2.358e-05', 'epoch': '0.04718', 'num_input_tokens_seen': 3836078, 'train_runtime': '1941', 'train_tokens_per_second': '1977'} +{'loss': '0.4223', 'grad_norm': '1.433', 'learning_rate': '2.359e-05', 'epoch': '0.04721', 'num_input_tokens_seen': 3838125, 'train_runtime': '1942', 'train_tokens_per_second': '1977'} +{'loss': '0.9176', 'grad_norm': '2.46', 'learning_rate': '2.36e-05', 'epoch': '0.04723', 'num_input_tokens_seen': 3840172, 'train_runtime': '1943', 'train_tokens_per_second': '1977'} +{'loss': '1.53', 'grad_norm': '3.203', 'learning_rate': '2.362e-05', 'epoch': '0.04726', 'num_input_tokens_seen': 3842219, 'train_runtime': '1944', 'train_tokens_per_second': '1977'} +{'loss': '1.523', 'grad_norm': '2.859', 'learning_rate': '2.363e-05', 'epoch': '0.04728', 'num_input_tokens_seen': 3844266, 'train_runtime': '1945', 'train_tokens_per_second': '1977'} +{'loss': '0.5313', 'grad_norm': '1.574', 'learning_rate': '2.364e-05', 'epoch': '0.04731', 'num_input_tokens_seen': 3846313, 'train_runtime': '1946', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '3.608', 'learning_rate': '2.365e-05', 'epoch': '0.04733', 'num_input_tokens_seen': 3848360, 'train_runtime': '1947', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '2.731', 'learning_rate': '2.367e-05', 'epoch': '0.04736', 'num_input_tokens_seen': 3850407, 'train_runtime': '1948', 'train_tokens_per_second': '1977'} +{'loss': '0.9207', 'grad_norm': '2.075', 'learning_rate': '2.368e-05', 'epoch': '0.04739', 'num_input_tokens_seen': 3852454, 'train_runtime': '1949', 'train_tokens_per_second': '1977'} +{'loss': '1.899', 'grad_norm': '3.344', 'learning_rate': '2.369e-05', 'epoch': '0.04741', 'num_input_tokens_seen': 3854501, 'train_runtime': '1950', 'train_tokens_per_second': '1977'} +{'loss': '1.082', 'grad_norm': '2.858', 'learning_rate': '2.37e-05', 'epoch': '0.04744', 'num_input_tokens_seen': 3856548, 'train_runtime': '1951', 'train_tokens_per_second': '1977'} +{'loss': '0.7254', 'grad_norm': '1.918', 'learning_rate': '2.372e-05', 'epoch': '0.04746', 'num_input_tokens_seen': 3858595, 'train_runtime': '1952', 'train_tokens_per_second': '1977'} +{'loss': '1.39', 'grad_norm': '2.967', 'learning_rate': '2.373e-05', 'epoch': '0.04749', 'num_input_tokens_seen': 3860642, 'train_runtime': '1953', 'train_tokens_per_second': '1977'} +{'loss': '0.5535', 'grad_norm': '1.638', 'learning_rate': '2.374e-05', 'epoch': '0.04751', 'num_input_tokens_seen': 3862689, 'train_runtime': '1954', 'train_tokens_per_second': '1977'} +{'loss': '0.4686', 'grad_norm': '1.873', 'learning_rate': '2.375e-05', 'epoch': '0.04754', 'num_input_tokens_seen': 3864736, 'train_runtime': '1955', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '3.446', 'learning_rate': '2.377e-05', 'epoch': '0.04756', 'num_input_tokens_seen': 3866783, 'train_runtime': '1956', 'train_tokens_per_second': '1977'} +{'loss': '0.4076', 'grad_norm': '1.852', 'learning_rate': '2.378e-05', 'epoch': '0.04759', 'num_input_tokens_seen': 3868830, 'train_runtime': '1957', 'train_tokens_per_second': '1977'} +{'loss': '1.704', 'grad_norm': '2.672', 'learning_rate': '2.379e-05', 'epoch': '0.04761', 'num_input_tokens_seen': 3870877, 'train_runtime': '1958', 'train_tokens_per_second': '1977'} +{'loss': '0.6424', 'grad_norm': '2.315', 'learning_rate': '2.38e-05', 'epoch': '0.04764', 'num_input_tokens_seen': 3872924, 'train_runtime': '1959', 'train_tokens_per_second': '1977'} +{'loss': '0.6451', 'grad_norm': '2.295', 'learning_rate': '2.382e-05', 'epoch': '0.04766', 'num_input_tokens_seen': 3874971, 'train_runtime': '1960', 'train_tokens_per_second': '1977'} +{'loss': '0.7164', 'grad_norm': '2.089', 'learning_rate': '2.383e-05', 'epoch': '0.04769', 'num_input_tokens_seen': 3877018, 'train_runtime': '1961', 'train_tokens_per_second': '1977'} +{'loss': '1.077', 'grad_norm': '2.813', 'learning_rate': '2.384e-05', 'epoch': '0.04771', 'num_input_tokens_seen': 3879065, 'train_runtime': '1962', 'train_tokens_per_second': '1977'} +{'loss': '0.4258', 'grad_norm': '1.884', 'learning_rate': '2.385e-05', 'epoch': '0.04774', 'num_input_tokens_seen': 3881112, 'train_runtime': '1963', 'train_tokens_per_second': '1977'} +{'loss': '1.996', 'grad_norm': '3.156', 'learning_rate': '2.387e-05', 'epoch': '0.04776', 'num_input_tokens_seen': 3883159, 'train_runtime': '1965', 'train_tokens_per_second': '1977'} +{'loss': '0.8825', 'grad_norm': '3.004', 'learning_rate': '2.388e-05', 'epoch': '0.04779', 'num_input_tokens_seen': 3885206, 'train_runtime': '1966', 'train_tokens_per_second': '1977'} +{'loss': '1.231', 'grad_norm': '1.942', 'learning_rate': '2.389e-05', 'epoch': '0.04781', 'num_input_tokens_seen': 3887253, 'train_runtime': '1967', 'train_tokens_per_second': '1977'} +{'loss': '1.609', 'grad_norm': '2.842', 'learning_rate': '2.39e-05', 'epoch': '0.04784', 'num_input_tokens_seen': 3889300, 'train_runtime': '1968', 'train_tokens_per_second': '1977'} +{'loss': '1.155', 'grad_norm': '2.85', 'learning_rate': '2.392e-05', 'epoch': '0.04786', 'num_input_tokens_seen': 3891347, 'train_runtime': '1969', 'train_tokens_per_second': '1977'} +{'loss': '0.8102', 'grad_norm': '1.929', 'learning_rate': '2.393e-05', 'epoch': '0.04789', 'num_input_tokens_seen': 3893394, 'train_runtime': '1970', 'train_tokens_per_second': '1977'} +{'loss': '1.721', 'grad_norm': '3.197', 'learning_rate': '2.394e-05', 'epoch': '0.04791', 'num_input_tokens_seen': 3895441, 'train_runtime': '1971', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '3.554', 'learning_rate': '2.396e-05', 'epoch': '0.04794', 'num_input_tokens_seen': 3897488, 'train_runtime': '1972', 'train_tokens_per_second': '1977'} +{'loss': '2.213', 'grad_norm': '3.178', 'learning_rate': '2.397e-05', 'epoch': '0.04796', 'num_input_tokens_seen': 3899535, 'train_runtime': '1973', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '2.671', 'learning_rate': '2.398e-05', 'epoch': '0.04799', 'num_input_tokens_seen': 3901582, 'train_runtime': '1974', 'train_tokens_per_second': '1977'} +{'loss': '0.9878', 'grad_norm': '2.58', 'learning_rate': '2.399e-05', 'epoch': '0.04801', 'num_input_tokens_seen': 3903629, 'train_runtime': '1975', 'train_tokens_per_second': '1977'} +{'loss': '0.8615', 'grad_norm': '1.991', 'learning_rate': '2.401e-05', 'epoch': '0.04804', 'num_input_tokens_seen': 3905676, 'train_runtime': '1976', 'train_tokens_per_second': '1977'} +{'loss': '1.322', 'grad_norm': '2.716', 'learning_rate': '2.402e-05', 'epoch': '0.04807', 'num_input_tokens_seen': 3907723, 'train_runtime': '1977', 'train_tokens_per_second': '1977'} +{'loss': '0.459', 'grad_norm': '1.748', 'learning_rate': '2.403e-05', 'epoch': '0.04809', 'num_input_tokens_seen': 3909770, 'train_runtime': '1978', 'train_tokens_per_second': '1977'} +{'loss': '1.753', 'grad_norm': '2.994', 'learning_rate': '2.404e-05', 'epoch': '0.04812', 'num_input_tokens_seen': 3911817, 'train_runtime': '1979', 'train_tokens_per_second': '1977'} +{'loss': '0.552', 'grad_norm': '1.978', 'learning_rate': '2.406e-05', 'epoch': '0.04814', 'num_input_tokens_seen': 3913864, 'train_runtime': '1980', 'train_tokens_per_second': '1977'} +{'loss': '1.258', 'grad_norm': '2.509', 'learning_rate': '2.407e-05', 'epoch': '0.04817', 'num_input_tokens_seen': 3915911, 'train_runtime': '1981', 'train_tokens_per_second': '1977'} +{'loss': '0.4642', 'grad_norm': '1.776', 'learning_rate': '2.408e-05', 'epoch': '0.04819', 'num_input_tokens_seen': 3917958, 'train_runtime': '1982', 'train_tokens_per_second': '1977'} +{'loss': '0.9064', 'grad_norm': '2.767', 'learning_rate': '2.409e-05', 'epoch': '0.04822', 'num_input_tokens_seen': 3920005, 'train_runtime': '1983', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.828', 'learning_rate': '2.411e-05', 'epoch': '0.04824', 'num_input_tokens_seen': 3922052, 'train_runtime': '1984', 'train_tokens_per_second': '1977'} +{'loss': '1.861', 'grad_norm': '3.398', 'learning_rate': '2.412e-05', 'epoch': '0.04827', 'num_input_tokens_seen': 3924099, 'train_runtime': '1985', 'train_tokens_per_second': '1977'} +{'loss': '0.928', 'grad_norm': '2.531', 'learning_rate': '2.413e-05', 'epoch': '0.04829', 'num_input_tokens_seen': 3926146, 'train_runtime': '1986', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '3.79', 'learning_rate': '2.414e-05', 'epoch': '0.04832', 'num_input_tokens_seen': 3928193, 'train_runtime': '1987', 'train_tokens_per_second': '1977'} +{'loss': '1.477', 'grad_norm': '3.519', 'learning_rate': '2.416e-05', 'epoch': '0.04834', 'num_input_tokens_seen': 3930240, 'train_runtime': '1988', 'train_tokens_per_second': '1977'} +{'loss': '1.377', 'grad_norm': '2.807', 'learning_rate': '2.417e-05', 'epoch': '0.04837', 'num_input_tokens_seen': 3932287, 'train_runtime': '1989', 'train_tokens_per_second': '1977'} +{'loss': '0.7146', 'grad_norm': '1.874', 'learning_rate': '2.418e-05', 'epoch': '0.04839', 'num_input_tokens_seen': 3934334, 'train_runtime': '1990', 'train_tokens_per_second': '1977'} +{'loss': '2.126', 'grad_norm': '3.372', 'learning_rate': '2.419e-05', 'epoch': '0.04842', 'num_input_tokens_seen': 3936381, 'train_runtime': '1991', 'train_tokens_per_second': '1977'} +{'loss': '0.4681', 'grad_norm': '1.584', 'learning_rate': '2.421e-05', 'epoch': '0.04844', 'num_input_tokens_seen': 3938428, 'train_runtime': '1992', 'train_tokens_per_second': '1977'} +{'loss': '0.879', 'grad_norm': '2.381', 'learning_rate': '2.422e-05', 'epoch': '0.04847', 'num_input_tokens_seen': 3940475, 'train_runtime': '1993', 'train_tokens_per_second': '1977'} +{'loss': '0.9473', 'grad_norm': '2.158', 'learning_rate': '2.423e-05', 'epoch': '0.04849', 'num_input_tokens_seen': 3942522, 'train_runtime': '1994', 'train_tokens_per_second': '1977'} +{'loss': '0.4209', 'grad_norm': '1.643', 'learning_rate': '2.424e-05', 'epoch': '0.04852', 'num_input_tokens_seen': 3944569, 'train_runtime': '1996', 'train_tokens_per_second': '1977'} +{'loss': '1.209', 'grad_norm': '2.849', 'learning_rate': '2.426e-05', 'epoch': '0.04854', 'num_input_tokens_seen': 3946616, 'train_runtime': '1997', 'train_tokens_per_second': '1977'} +{'loss': '1.629', 'grad_norm': '2.679', 'learning_rate': '2.427e-05', 'epoch': '0.04857', 'num_input_tokens_seen': 3948663, 'train_runtime': '1998', 'train_tokens_per_second': '1977'} +{'loss': '0.5891', 'grad_norm': '1.825', 'learning_rate': '2.428e-05', 'epoch': '0.04859', 'num_input_tokens_seen': 3950710, 'train_runtime': '1999', 'train_tokens_per_second': '1977'} +{'loss': '2.479', 'grad_norm': '3.394', 'learning_rate': '2.43e-05', 'epoch': '0.04862', 'num_input_tokens_seen': 3952757, 'train_runtime': '2000', 'train_tokens_per_second': '1977'} +{'loss': '0.4807', 'grad_norm': '1.953', 'learning_rate': '2.431e-05', 'epoch': '0.04864', 'num_input_tokens_seen': 3954804, 'train_runtime': '2001', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '2.757', 'learning_rate': '2.432e-05', 'epoch': '0.04867', 'num_input_tokens_seen': 3956851, 'train_runtime': '2002', 'train_tokens_per_second': '1977'} +{'loss': '0.5059', 'grad_norm': '1.314', 'learning_rate': '2.433e-05', 'epoch': '0.04869', 'num_input_tokens_seen': 3958898, 'train_runtime': '2003', 'train_tokens_per_second': '1977'} +{'loss': '0.9945', 'grad_norm': '2.266', 'learning_rate': '2.435e-05', 'epoch': '0.04872', 'num_input_tokens_seen': 3960945, 'train_runtime': '2004', 'train_tokens_per_second': '1977'} +{'loss': '0.3409', 'grad_norm': '1.485', 'learning_rate': '2.436e-05', 'epoch': '0.04874', 'num_input_tokens_seen': 3962992, 'train_runtime': '2005', 'train_tokens_per_second': '1977'} +{'loss': '0.439', 'grad_norm': '1.658', 'learning_rate': '2.437e-05', 'epoch': '0.04877', 'num_input_tokens_seen': 3965039, 'train_runtime': '2006', 'train_tokens_per_second': '1977'} +{'loss': '0.6429', 'grad_norm': '2.075', 'learning_rate': '2.438e-05', 'epoch': '0.0488', 'num_input_tokens_seen': 3967086, 'train_runtime': '2007', 'train_tokens_per_second': '1977'} +{'loss': '1.207', 'grad_norm': '2.807', 'learning_rate': '2.44e-05', 'epoch': '0.04882', 'num_input_tokens_seen': 3969133, 'train_runtime': '2008', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '3.089', 'learning_rate': '2.441e-05', 'epoch': '0.04885', 'num_input_tokens_seen': 3971180, 'train_runtime': '2009', 'train_tokens_per_second': '1977'} +{'loss': '0.4657', 'grad_norm': '1.621', 'learning_rate': '2.442e-05', 'epoch': '0.04887', 'num_input_tokens_seen': 3973227, 'train_runtime': '2010', 'train_tokens_per_second': '1977'} +{'loss': '0.9054', 'grad_norm': '2.67', 'learning_rate': '2.443e-05', 'epoch': '0.0489', 'num_input_tokens_seen': 3975274, 'train_runtime': '2011', 'train_tokens_per_second': '1977'} +{'loss': '0.5138', 'grad_norm': '1.841', 'learning_rate': '2.445e-05', 'epoch': '0.04892', 'num_input_tokens_seen': 3977321, 'train_runtime': '2012', 'train_tokens_per_second': '1977'} +{'loss': '3.583', 'grad_norm': '3.322', 'learning_rate': '2.446e-05', 'epoch': '0.04895', 'num_input_tokens_seen': 3979368, 'train_runtime': '2013', 'train_tokens_per_second': '1977'} +{'loss': '2.219', 'grad_norm': '3.417', 'learning_rate': '2.447e-05', 'epoch': '0.04897', 'num_input_tokens_seen': 3981415, 'train_runtime': '2014', 'train_tokens_per_second': '1977'} +{'loss': '0.9568', 'grad_norm': '2.05', 'learning_rate': '2.448e-05', 'epoch': '0.049', 'num_input_tokens_seen': 3983462, 'train_runtime': '2015', 'train_tokens_per_second': '1977'} +{'loss': '2.691', 'grad_norm': '3.377', 'learning_rate': '2.45e-05', 'epoch': '0.04902', 'num_input_tokens_seen': 3985509, 'train_runtime': '2016', 'train_tokens_per_second': '1977'} +{'loss': '1.991', 'grad_norm': '3.122', 'learning_rate': '2.451e-05', 'epoch': '0.04905', 'num_input_tokens_seen': 3987556, 'train_runtime': '2017', 'train_tokens_per_second': '1977'} +{'loss': '1.879', 'grad_norm': '3.287', 'learning_rate': '2.452e-05', 'epoch': '0.04907', 'num_input_tokens_seen': 3989603, 'train_runtime': '2018', 'train_tokens_per_second': '1977'} +{'loss': '0.9327', 'grad_norm': '2.388', 'learning_rate': '2.453e-05', 'epoch': '0.0491', 'num_input_tokens_seen': 3991650, 'train_runtime': '2019', 'train_tokens_per_second': '1977'} +{'loss': '0.904', 'grad_norm': '2.434', 'learning_rate': '2.455e-05', 'epoch': '0.04912', 'num_input_tokens_seen': 3993697, 'train_runtime': '2020', 'train_tokens_per_second': '1977'} +{'loss': '2.042', 'grad_norm': '3.121', 'learning_rate': '2.456e-05', 'epoch': '0.04915', 'num_input_tokens_seen': 3995744, 'train_runtime': '2021', 'train_tokens_per_second': '1977'} +{'loss': '1.954', 'grad_norm': '3.715', 'learning_rate': '2.457e-05', 'epoch': '0.04917', 'num_input_tokens_seen': 3997791, 'train_runtime': '2022', 'train_tokens_per_second': '1977'} +{'loss': '0.4014', 'grad_norm': '1.78', 'learning_rate': '2.458e-05', 'epoch': '0.0492', 'num_input_tokens_seen': 3999838, 'train_runtime': '2023', 'train_tokens_per_second': '1977'} +{'loss': '1.321', 'grad_norm': '3.004', 'learning_rate': '2.46e-05', 'epoch': '0.04922', 'num_input_tokens_seen': 4001885, 'train_runtime': '2024', 'train_tokens_per_second': '1977'} +{'loss': '0.9816', 'grad_norm': '2.083', 'learning_rate': '2.461e-05', 'epoch': '0.04925', 'num_input_tokens_seen': 4003932, 'train_runtime': '2025', 'train_tokens_per_second': '1977'} +{'loss': '0.8556', 'grad_norm': '2.502', 'learning_rate': '2.462e-05', 'epoch': '0.04927', 'num_input_tokens_seen': 4005979, 'train_runtime': '2027', 'train_tokens_per_second': '1977'} +{'loss': '0.4706', 'grad_norm': '1.574', 'learning_rate': '2.463e-05', 'epoch': '0.0493', 'num_input_tokens_seen': 4008026, 'train_runtime': '2028', 'train_tokens_per_second': '1977'} +{'loss': '1.533', 'grad_norm': '3.017', 'learning_rate': '2.465e-05', 'epoch': '0.04932', 'num_input_tokens_seen': 4010073, 'train_runtime': '2029', 'train_tokens_per_second': '1977'} +{'loss': '0.4285', 'grad_norm': '1.358', 'learning_rate': '2.466e-05', 'epoch': '0.04935', 'num_input_tokens_seen': 4012120, 'train_runtime': '2030', 'train_tokens_per_second': '1977'} +{'loss': '1.139', 'grad_norm': '3.319', 'learning_rate': '2.467e-05', 'epoch': '0.04937', 'num_input_tokens_seen': 4014167, 'train_runtime': '2031', 'train_tokens_per_second': '1977'} +{'loss': '1.397', 'grad_norm': '3.044', 'learning_rate': '2.469e-05', 'epoch': '0.0494', 'num_input_tokens_seen': 4016214, 'train_runtime': '2032', 'train_tokens_per_second': '1977'} +{'loss': '1.965', 'grad_norm': '3.155', 'learning_rate': '2.47e-05', 'epoch': '0.04942', 'num_input_tokens_seen': 4018261, 'train_runtime': '2033', 'train_tokens_per_second': '1977'} +{'loss': '1.293', 'grad_norm': '3.409', 'learning_rate': '2.471e-05', 'epoch': '0.04945', 'num_input_tokens_seen': 4020308, 'train_runtime': '2034', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '3.654', 'learning_rate': '2.472e-05', 'epoch': '0.04948', 'num_input_tokens_seen': 4022355, 'train_runtime': '2035', 'train_tokens_per_second': '1977'} +{'loss': '1.169', 'grad_norm': '2.51', 'learning_rate': '2.474e-05', 'epoch': '0.0495', 'num_input_tokens_seen': 4024402, 'train_runtime': '2036', 'train_tokens_per_second': '1977'} +{'loss': '1.372', 'grad_norm': '2.967', 'learning_rate': '2.475e-05', 'epoch': '0.04953', 'num_input_tokens_seen': 4026449, 'train_runtime': '2037', 'train_tokens_per_second': '1977'} +{'loss': '0.8743', 'grad_norm': '2.383', 'learning_rate': '2.476e-05', 'epoch': '0.04955', 'num_input_tokens_seen': 4028496, 'train_runtime': '2038', 'train_tokens_per_second': '1977'} +{'loss': '0.475', 'grad_norm': '1.842', 'learning_rate': '2.477e-05', 'epoch': '0.04958', 'num_input_tokens_seen': 4030543, 'train_runtime': '2039', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '2.478', 'learning_rate': '2.479e-05', 'epoch': '0.0496', 'num_input_tokens_seen': 4032590, 'train_runtime': '2040', 'train_tokens_per_second': '1977'} +{'loss': '0.7359', 'grad_norm': '2.171', 'learning_rate': '2.48e-05', 'epoch': '0.04963', 'num_input_tokens_seen': 4034637, 'train_runtime': '2041', 'train_tokens_per_second': '1977'} +{'loss': '0.9384', 'grad_norm': '2.104', 'learning_rate': '2.481e-05', 'epoch': '0.04965', 'num_input_tokens_seen': 4036684, 'train_runtime': '2042', 'train_tokens_per_second': '1977'} +{'loss': '0.8099', 'grad_norm': '2.225', 'learning_rate': '2.482e-05', 'epoch': '0.04968', 'num_input_tokens_seen': 4038731, 'train_runtime': '2043', 'train_tokens_per_second': '1977'} +{'loss': '0.7464', 'grad_norm': '2.214', 'learning_rate': '2.484e-05', 'epoch': '0.0497', 'num_input_tokens_seen': 4040778, 'train_runtime': '2044', 'train_tokens_per_second': '1977'} +{'loss': '1.153', 'grad_norm': '2.311', 'learning_rate': '2.485e-05', 'epoch': '0.04973', 'num_input_tokens_seen': 4042825, 'train_runtime': '2045', 'train_tokens_per_second': '1977'} +{'loss': '2.108', 'grad_norm': '3.552', 'learning_rate': '2.486e-05', 'epoch': '0.04975', 'num_input_tokens_seen': 4044872, 'train_runtime': '2046', 'train_tokens_per_second': '1977'} +{'loss': '1.637', 'grad_norm': '3.261', 'learning_rate': '2.487e-05', 'epoch': '0.04978', 'num_input_tokens_seen': 4046919, 'train_runtime': '2047', 'train_tokens_per_second': '1977'} +{'loss': '0.9703', 'grad_norm': '2.756', 'learning_rate': '2.489e-05', 'epoch': '0.0498', 'num_input_tokens_seen': 4048966, 'train_runtime': '2048', 'train_tokens_per_second': '1977'} +{'loss': '1.541', 'grad_norm': '2.853', 'learning_rate': '2.49e-05', 'epoch': '0.04983', 'num_input_tokens_seen': 4051013, 'train_runtime': '2049', 'train_tokens_per_second': '1977'} +{'loss': '0.9496', 'grad_norm': '2.84', 'learning_rate': '2.491e-05', 'epoch': '0.04985', 'num_input_tokens_seen': 4053060, 'train_runtime': '2050', 'train_tokens_per_second': '1977'} +{'loss': '1.269', 'grad_norm': '2.967', 'learning_rate': '2.492e-05', 'epoch': '0.04988', 'num_input_tokens_seen': 4055107, 'train_runtime': '2051', 'train_tokens_per_second': '1977'} +{'loss': '0.4281', 'grad_norm': '1.978', 'learning_rate': '2.494e-05', 'epoch': '0.0499', 'num_input_tokens_seen': 4057154, 'train_runtime': '2052', 'train_tokens_per_second': '1977'} +{'loss': '1.041', 'grad_norm': '2.724', 'learning_rate': '2.495e-05', 'epoch': '0.04993', 'num_input_tokens_seen': 4059201, 'train_runtime': '2053', 'train_tokens_per_second': '1977'} +{'loss': '0.6475', 'grad_norm': '1.974', 'learning_rate': '2.496e-05', 'epoch': '0.04995', 'num_input_tokens_seen': 4061248, 'train_runtime': '2054', 'train_tokens_per_second': '1977'} +{'loss': '1.68', 'grad_norm': '3.548', 'learning_rate': '2.497e-05', 'epoch': '0.04998', 'num_input_tokens_seen': 4063295, 'train_runtime': '2056', 'train_tokens_per_second': '1977'} +{'loss': '0.8166', 'grad_norm': '2.757', 'learning_rate': '2.499e-05', 'epoch': '0.05', 'num_input_tokens_seen': 4065342, 'train_runtime': '2057', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '2.632', 'learning_rate': '2.5e-05', 'epoch': '0.05003', 'num_input_tokens_seen': 4067389, 'train_runtime': '2058', 'train_tokens_per_second': '1977'} +{'loss': '0.8006', 'grad_norm': '2.565', 'learning_rate': '2.501e-05', 'epoch': '0.05005', 'num_input_tokens_seen': 4069436, 'train_runtime': '2059', 'train_tokens_per_second': '1977'} +{'loss': '1.204', 'grad_norm': '2.572', 'learning_rate': '2.503e-05', 'epoch': '0.05008', 'num_input_tokens_seen': 4071483, 'train_runtime': '2060', 'train_tokens_per_second': '1977'} +{'loss': '0.47', 'grad_norm': '2.05', 'learning_rate': '2.504e-05', 'epoch': '0.0501', 'num_input_tokens_seen': 4073530, 'train_runtime': '2061', 'train_tokens_per_second': '1977'} +{'loss': '0.9068', 'grad_norm': '2.679', 'learning_rate': '2.505e-05', 'epoch': '0.05013', 'num_input_tokens_seen': 4075577, 'train_runtime': '2062', 'train_tokens_per_second': '1977'} +{'loss': '0.4638', 'grad_norm': '1.674', 'learning_rate': '2.506e-05', 'epoch': '0.05015', 'num_input_tokens_seen': 4077624, 'train_runtime': '2063', 'train_tokens_per_second': '1977'} +{'loss': '2.757', 'grad_norm': '3.589', 'learning_rate': '2.508e-05', 'epoch': '0.05018', 'num_input_tokens_seen': 4079671, 'train_runtime': '2064', 'train_tokens_per_second': '1977'} +{'loss': '0.9035', 'grad_norm': '2.826', 'learning_rate': '2.509e-05', 'epoch': '0.05021', 'num_input_tokens_seen': 4081718, 'train_runtime': '2065', 'train_tokens_per_second': '1977'} +{'loss': '0.508', 'grad_norm': '1.591', 'learning_rate': '2.51e-05', 'epoch': '0.05023', 'num_input_tokens_seen': 4083765, 'train_runtime': '2066', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '2.68', 'learning_rate': '2.511e-05', 'epoch': '0.05026', 'num_input_tokens_seen': 4085812, 'train_runtime': '2067', 'train_tokens_per_second': '1977'} +{'loss': '0.4495', 'grad_norm': '1.814', 'learning_rate': '2.513e-05', 'epoch': '0.05028', 'num_input_tokens_seen': 4087859, 'train_runtime': '2068', 'train_tokens_per_second': '1977'} +{'loss': '0.644', 'grad_norm': '1.821', 'learning_rate': '2.514e-05', 'epoch': '0.05031', 'num_input_tokens_seen': 4089906, 'train_runtime': '2069', 'train_tokens_per_second': '1977'} +{'loss': '0.7976', 'grad_norm': '2.569', 'learning_rate': '2.515e-05', 'epoch': '0.05033', 'num_input_tokens_seen': 4091953, 'train_runtime': '2070', 'train_tokens_per_second': '1977'} +{'loss': '0.3857', 'grad_norm': '1.792', 'learning_rate': '2.516e-05', 'epoch': '0.05036', 'num_input_tokens_seen': 4094000, 'train_runtime': '2071', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 03:11:56,512 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 03:11:56,512 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 03:11:56,892 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-2000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 03:11:56,899 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-2000/tokenizer_config.json + +{'loss': '1.498', 'grad_norm': '3.416', 'learning_rate': '2.518e-05', 'epoch': '0.05038', 'num_input_tokens_seen': 4096047, 'train_runtime': '2073', 'train_tokens_per_second': '1976'} +{'loss': '0.4731', 'grad_norm': '1.991', 'learning_rate': '2.519e-05', 'epoch': '0.05041', 'num_input_tokens_seen': 4098094, 'train_runtime': '2074', 'train_tokens_per_second': '1976'} +{'loss': '1.235', 'grad_norm': '2.6', 'learning_rate': '2.52e-05', 'epoch': '0.05043', 'num_input_tokens_seen': 4100141, 'train_runtime': '2075', 'train_tokens_per_second': '1976'} +{'loss': '0.3796', 'grad_norm': '1.722', 'learning_rate': '2.521e-05', 'epoch': '0.05046', 'num_input_tokens_seen': 4102188, 'train_runtime': '2076', 'train_tokens_per_second': '1976'} +{'loss': '2.496', 'grad_norm': '3.481', 'learning_rate': '2.523e-05', 'epoch': '0.05048', 'num_input_tokens_seen': 4104235, 'train_runtime': '2077', 'train_tokens_per_second': '1976'} +{'loss': '2.709', 'grad_norm': '4.023', 'learning_rate': '2.524e-05', 'epoch': '0.05051', 'num_input_tokens_seen': 4106282, 'train_runtime': '2078', 'train_tokens_per_second': '1976'} +{'loss': '2.676', 'grad_norm': '3.944', 'learning_rate': '2.525e-05', 'epoch': '0.05053', 'num_input_tokens_seen': 4108329, 'train_runtime': '2079', 'train_tokens_per_second': '1976'} +{'loss': '1.47', 'grad_norm': '2.6', 'learning_rate': '2.526e-05', 'epoch': '0.05056', 'num_input_tokens_seen': 4110376, 'train_runtime': '2080', 'train_tokens_per_second': '1976'} +{'loss': '1.038', 'grad_norm': '2.351', 'learning_rate': '2.528e-05', 'epoch': '0.05058', 'num_input_tokens_seen': 4112423, 'train_runtime': '2081', 'train_tokens_per_second': '1976'} +{'loss': '1.453', 'grad_norm': '3.16', 'learning_rate': '2.529e-05', 'epoch': '0.05061', 'num_input_tokens_seen': 4114470, 'train_runtime': '2082', 'train_tokens_per_second': '1976'} +{'loss': '1.948', 'grad_norm': '2.763', 'learning_rate': '2.53e-05', 'epoch': '0.05063', 'num_input_tokens_seen': 4116517, 'train_runtime': '2083', 'train_tokens_per_second': '1976'} +{'loss': '0.8533', 'grad_norm': '2.879', 'learning_rate': '2.531e-05', 'epoch': '0.05066', 'num_input_tokens_seen': 4118564, 'train_runtime': '2084', 'train_tokens_per_second': '1976'} +{'loss': '0.8158', 'grad_norm': '2.577', 'learning_rate': '2.533e-05', 'epoch': '0.05068', 'num_input_tokens_seen': 4120611, 'train_runtime': '2085', 'train_tokens_per_second': '1976'} +{'loss': '2.169', 'grad_norm': '3.08', 'learning_rate': '2.534e-05', 'epoch': '0.05071', 'num_input_tokens_seen': 4122658, 'train_runtime': '2086', 'train_tokens_per_second': '1976'} +{'loss': '0.4706', 'grad_norm': '2.015', 'learning_rate': '2.535e-05', 'epoch': '0.05073', 'num_input_tokens_seen': 4124705, 'train_runtime': '2087', 'train_tokens_per_second': '1976'} +{'loss': '1.334', 'grad_norm': '2.816', 'learning_rate': '2.537e-05', 'epoch': '0.05076', 'num_input_tokens_seen': 4126752, 'train_runtime': '2088', 'train_tokens_per_second': '1976'} +{'loss': '2.41', 'grad_norm': '3.687', 'learning_rate': '2.538e-05', 'epoch': '0.05078', 'num_input_tokens_seen': 4128799, 'train_runtime': '2089', 'train_tokens_per_second': '1976'} +{'loss': '1.108', 'grad_norm': '2.535', 'learning_rate': '2.539e-05', 'epoch': '0.05081', 'num_input_tokens_seen': 4130846, 'train_runtime': '2090', 'train_tokens_per_second': '1976'} +{'loss': '1.012', 'grad_norm': '2.39', 'learning_rate': '2.54e-05', 'epoch': '0.05083', 'num_input_tokens_seen': 4132893, 'train_runtime': '2091', 'train_tokens_per_second': '1976'} +{'loss': '1.962', 'grad_norm': '3.706', 'learning_rate': '2.542e-05', 'epoch': '0.05086', 'num_input_tokens_seen': 4134940, 'train_runtime': '2092', 'train_tokens_per_second': '1976'} +{'loss': '0.9657', 'grad_norm': '2.464', 'learning_rate': '2.543e-05', 'epoch': '0.05089', 'num_input_tokens_seen': 4136987, 'train_runtime': '2093', 'train_tokens_per_second': '1976'} +{'loss': '1.1', 'grad_norm': '2.358', 'learning_rate': '2.544e-05', 'epoch': '0.05091', 'num_input_tokens_seen': 4139034, 'train_runtime': '2094', 'train_tokens_per_second': '1976'} +{'loss': '0.4886', 'grad_norm': '2.307', 'learning_rate': '2.545e-05', 'epoch': '0.05094', 'num_input_tokens_seen': 4141081, 'train_runtime': '2095', 'train_tokens_per_second': '1976'} +{'loss': '1.54', 'grad_norm': '5.331', 'learning_rate': '2.547e-05', 'epoch': '0.05096', 'num_input_tokens_seen': 4143128, 'train_runtime': '2096', 'train_tokens_per_second': '1976'} +{'loss': '1.063', 'grad_norm': '2.965', 'learning_rate': '2.548e-05', 'epoch': '0.05099', 'num_input_tokens_seen': 4145175, 'train_runtime': '2097', 'train_tokens_per_second': '1976'} +{'loss': '0.7199', 'grad_norm': '2.696', 'learning_rate': '2.549e-05', 'epoch': '0.05101', 'num_input_tokens_seen': 4147222, 'train_runtime': '2098', 'train_tokens_per_second': '1976'} +{'loss': '0.5743', 'grad_norm': '2.079', 'learning_rate': '2.55e-05', 'epoch': '0.05104', 'num_input_tokens_seen': 4149269, 'train_runtime': '2099', 'train_tokens_per_second': '1976'} +{'loss': '1.303', 'grad_norm': '2.601', 'learning_rate': '2.552e-05', 'epoch': '0.05106', 'num_input_tokens_seen': 4151316, 'train_runtime': '2100', 'train_tokens_per_second': '1976'} +{'loss': '0.8933', 'grad_norm': '2.14', 'learning_rate': '2.553e-05', 'epoch': '0.05109', 'num_input_tokens_seen': 4153363, 'train_runtime': '2102', 'train_tokens_per_second': '1976'} +{'loss': '1.51', 'grad_norm': '2.741', 'learning_rate': '2.554e-05', 'epoch': '0.05111', 'num_input_tokens_seen': 4155410, 'train_runtime': '2103', 'train_tokens_per_second': '1976'} +{'loss': '0.5083', 'grad_norm': '1.982', 'learning_rate': '2.555e-05', 'epoch': '0.05114', 'num_input_tokens_seen': 4157457, 'train_runtime': '2104', 'train_tokens_per_second': '1976'} +{'loss': '0.8794', 'grad_norm': '2.522', 'learning_rate': '2.557e-05', 'epoch': '0.05116', 'num_input_tokens_seen': 4159504, 'train_runtime': '2105', 'train_tokens_per_second': '1976'} +{'loss': '0.7681', 'grad_norm': '2.341', 'learning_rate': '2.558e-05', 'epoch': '0.05119', 'num_input_tokens_seen': 4161551, 'train_runtime': '2106', 'train_tokens_per_second': '1976'} +{'loss': '0.9929', 'grad_norm': '2.446', 'learning_rate': '2.559e-05', 'epoch': '0.05121', 'num_input_tokens_seen': 4163598, 'train_runtime': '2107', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '2.237', 'learning_rate': '2.56e-05', 'epoch': '0.05124', 'num_input_tokens_seen': 4165645, 'train_runtime': '2108', 'train_tokens_per_second': '1976'} +{'loss': '2.027', 'grad_norm': '3.23', 'learning_rate': '2.562e-05', 'epoch': '0.05126', 'num_input_tokens_seen': 4167692, 'train_runtime': '2109', 'train_tokens_per_second': '1976'} +{'loss': '1.862', 'grad_norm': '3.469', 'learning_rate': '2.563e-05', 'epoch': '0.05129', 'num_input_tokens_seen': 4169739, 'train_runtime': '2110', 'train_tokens_per_second': '1976'} +{'loss': '1.449', 'grad_norm': '3.012', 'learning_rate': '2.564e-05', 'epoch': '0.05131', 'num_input_tokens_seen': 4171786, 'train_runtime': '2111', 'train_tokens_per_second': '1976'} +{'loss': '2.011', 'grad_norm': '3.419', 'learning_rate': '2.565e-05', 'epoch': '0.05134', 'num_input_tokens_seen': 4173833, 'train_runtime': '2112', 'train_tokens_per_second': '1976'} +{'loss': '2.852', 'grad_norm': '3.868', 'learning_rate': '2.567e-05', 'epoch': '0.05136', 'num_input_tokens_seen': 4175880, 'train_runtime': '2113', 'train_tokens_per_second': '1976'} +{'loss': '0.4564', 'grad_norm': '2.123', 'learning_rate': '2.568e-05', 'epoch': '0.05139', 'num_input_tokens_seen': 4177927, 'train_runtime': '2114', 'train_tokens_per_second': '1976'} +{'loss': '0.9901', 'grad_norm': '2.319', 'learning_rate': '2.569e-05', 'epoch': '0.05141', 'num_input_tokens_seen': 4179974, 'train_runtime': '2115', 'train_tokens_per_second': '1976'} +{'loss': '1.221', 'grad_norm': '2.467', 'learning_rate': '2.57e-05', 'epoch': '0.05144', 'num_input_tokens_seen': 4182021, 'train_runtime': '2116', 'train_tokens_per_second': '1976'} +{'loss': '1.109', 'grad_norm': '2.58', 'learning_rate': '2.572e-05', 'epoch': '0.05146', 'num_input_tokens_seen': 4184068, 'train_runtime': '2117', 'train_tokens_per_second': '1976'} +{'loss': '0.6672', 'grad_norm': '2.229', 'learning_rate': '2.573e-05', 'epoch': '0.05149', 'num_input_tokens_seen': 4186115, 'train_runtime': '2118', 'train_tokens_per_second': '1976'} +{'loss': '0.9117', 'grad_norm': '2.48', 'learning_rate': '2.574e-05', 'epoch': '0.05151', 'num_input_tokens_seen': 4188162, 'train_runtime': '2119', 'train_tokens_per_second': '1976'} +{'loss': '1.157', 'grad_norm': '2.16', 'learning_rate': '2.576e-05', 'epoch': '0.05154', 'num_input_tokens_seen': 4190209, 'train_runtime': '2120', 'train_tokens_per_second': '1976'} +{'loss': '0.692', 'grad_norm': '1.909', 'learning_rate': '2.577e-05', 'epoch': '0.05156', 'num_input_tokens_seen': 4192256, 'train_runtime': '2121', 'train_tokens_per_second': '1976'} +{'loss': '0.5003', 'grad_norm': '1.359', 'learning_rate': '2.578e-05', 'epoch': '0.05159', 'num_input_tokens_seen': 4194303, 'train_runtime': '2122', 'train_tokens_per_second': '1976'} +{'loss': '0.8229', 'grad_norm': '1.858', 'learning_rate': '2.579e-05', 'epoch': '0.05162', 'num_input_tokens_seen': 4196350, 'train_runtime': '2123', 'train_tokens_per_second': '1976'} +{'loss': '0.6838', 'grad_norm': '2.211', 'learning_rate': '2.581e-05', 'epoch': '0.05164', 'num_input_tokens_seen': 4198397, 'train_runtime': '2124', 'train_tokens_per_second': '1976'} +{'loss': '0.3767', 'grad_norm': '1.488', 'learning_rate': '2.582e-05', 'epoch': '0.05167', 'num_input_tokens_seen': 4200444, 'train_runtime': '2125', 'train_tokens_per_second': '1976'} +{'loss': '0.3559', 'grad_norm': '1.377', 'learning_rate': '2.583e-05', 'epoch': '0.05169', 'num_input_tokens_seen': 4202491, 'train_runtime': '2126', 'train_tokens_per_second': '1976'} +{'loss': '2.378', 'grad_norm': '2.497', 'learning_rate': '2.584e-05', 'epoch': '0.05172', 'num_input_tokens_seen': 4204538, 'train_runtime': '2127', 'train_tokens_per_second': '1976'} +{'loss': '1.015', 'grad_norm': '2.37', 'learning_rate': '2.586e-05', 'epoch': '0.05174', 'num_input_tokens_seen': 4206585, 'train_runtime': '2128', 'train_tokens_per_second': '1976'} +{'loss': '0.7102', 'grad_norm': '2.127', 'learning_rate': '2.587e-05', 'epoch': '0.05177', 'num_input_tokens_seen': 4208632, 'train_runtime': '2129', 'train_tokens_per_second': '1976'} +{'loss': '0.9664', 'grad_norm': '2.416', 'learning_rate': '2.588e-05', 'epoch': '0.05179', 'num_input_tokens_seen': 4210679, 'train_runtime': '2130', 'train_tokens_per_second': '1976'} +{'loss': '0.8573', 'grad_norm': '2.026', 'learning_rate': '2.589e-05', 'epoch': '0.05182', 'num_input_tokens_seen': 4212726, 'train_runtime': '2131', 'train_tokens_per_second': '1976'} +{'loss': '1.324', 'grad_norm': '2.768', 'learning_rate': '2.591e-05', 'epoch': '0.05184', 'num_input_tokens_seen': 4214773, 'train_runtime': '2133', 'train_tokens_per_second': '1976'} +{'loss': '0.5007', 'grad_norm': '1.767', 'learning_rate': '2.592e-05', 'epoch': '0.05187', 'num_input_tokens_seen': 4216820, 'train_runtime': '2134', 'train_tokens_per_second': '1976'} +{'loss': '0.4422', 'grad_norm': '1.813', 'learning_rate': '2.593e-05', 'epoch': '0.05189', 'num_input_tokens_seen': 4218867, 'train_runtime': '2135', 'train_tokens_per_second': '1976'} +{'loss': '1.451', 'grad_norm': '2.892', 'learning_rate': '2.594e-05', 'epoch': '0.05192', 'num_input_tokens_seen': 4220914, 'train_runtime': '2136', 'train_tokens_per_second': '1976'} +{'loss': '0.4763', 'grad_norm': '2.008', 'learning_rate': '2.596e-05', 'epoch': '0.05194', 'num_input_tokens_seen': 4222961, 'train_runtime': '2137', 'train_tokens_per_second': '1976'} +{'loss': '0.7699', 'grad_norm': '2.335', 'learning_rate': '2.597e-05', 'epoch': '0.05197', 'num_input_tokens_seen': 4225008, 'train_runtime': '2138', 'train_tokens_per_second': '1976'} +{'loss': '0.4304', 'grad_norm': '1.586', 'learning_rate': '2.598e-05', 'epoch': '0.05199', 'num_input_tokens_seen': 4227055, 'train_runtime': '2139', 'train_tokens_per_second': '1976'} +{'loss': '0.7481', 'grad_norm': '2.04', 'learning_rate': '2.599e-05', 'epoch': '0.05202', 'num_input_tokens_seen': 4229102, 'train_runtime': '2140', 'train_tokens_per_second': '1976'} +{'loss': '0.6974', 'grad_norm': '2.086', 'learning_rate': '2.601e-05', 'epoch': '0.05204', 'num_input_tokens_seen': 4231149, 'train_runtime': '2141', 'train_tokens_per_second': '1976'} +{'loss': '0.9468', 'grad_norm': '2.035', 'learning_rate': '2.602e-05', 'epoch': '0.05207', 'num_input_tokens_seen': 4233196, 'train_runtime': '2142', 'train_tokens_per_second': '1976'} +{'loss': '0.6488', 'grad_norm': '2.215', 'learning_rate': '2.603e-05', 'epoch': '0.05209', 'num_input_tokens_seen': 4235243, 'train_runtime': '2143', 'train_tokens_per_second': '1976'} +{'loss': '0.9411', 'grad_norm': '2.353', 'learning_rate': '2.604e-05', 'epoch': '0.05212', 'num_input_tokens_seen': 4237290, 'train_runtime': '2144', 'train_tokens_per_second': '1976'} +{'loss': '0.7929', 'grad_norm': '2.435', 'learning_rate': '2.606e-05', 'epoch': '0.05214', 'num_input_tokens_seen': 4239337, 'train_runtime': '2145', 'train_tokens_per_second': '1976'} +{'loss': '1.063', 'grad_norm': '2.567', 'learning_rate': '2.607e-05', 'epoch': '0.05217', 'num_input_tokens_seen': 4241384, 'train_runtime': '2146', 'train_tokens_per_second': '1976'} +{'loss': '1.996', 'grad_norm': '2.782', 'learning_rate': '2.608e-05', 'epoch': '0.05219', 'num_input_tokens_seen': 4243431, 'train_runtime': '2147', 'train_tokens_per_second': '1976'} +{'loss': '2.804', 'grad_norm': '3.812', 'learning_rate': '2.61e-05', 'epoch': '0.05222', 'num_input_tokens_seen': 4245478, 'train_runtime': '2148', 'train_tokens_per_second': '1976'} +{'loss': '0.3781', 'grad_norm': '1.626', 'learning_rate': '2.611e-05', 'epoch': '0.05224', 'num_input_tokens_seen': 4247525, 'train_runtime': '2149', 'train_tokens_per_second': '1976'} +{'loss': '0.4502', 'grad_norm': '1.552', 'learning_rate': '2.612e-05', 'epoch': '0.05227', 'num_input_tokens_seen': 4249572, 'train_runtime': '2150', 'train_tokens_per_second': '1976'} +{'loss': '0.4987', 'grad_norm': '1.974', 'learning_rate': '2.613e-05', 'epoch': '0.05229', 'num_input_tokens_seen': 4251619, 'train_runtime': '2151', 'train_tokens_per_second': '1976'} +{'loss': '1.884', 'grad_norm': '3.15', 'learning_rate': '2.615e-05', 'epoch': '0.05232', 'num_input_tokens_seen': 4253666, 'train_runtime': '2152', 'train_tokens_per_second': '1976'} +{'loss': '0.5308', 'grad_norm': '2.336', 'learning_rate': '2.616e-05', 'epoch': '0.05235', 'num_input_tokens_seen': 4255713, 'train_runtime': '2153', 'train_tokens_per_second': '1976'} +{'loss': '0.3574', 'grad_norm': '1.841', 'learning_rate': '2.617e-05', 'epoch': '0.05237', 'num_input_tokens_seen': 4257760, 'train_runtime': '2154', 'train_tokens_per_second': '1976'} +{'loss': '1.006', 'grad_norm': '2.529', 'learning_rate': '2.618e-05', 'epoch': '0.0524', 'num_input_tokens_seen': 4259807, 'train_runtime': '2155', 'train_tokens_per_second': '1976'} +{'loss': '2.23', 'grad_norm': '3.31', 'learning_rate': '2.62e-05', 'epoch': '0.05242', 'num_input_tokens_seen': 4261854, 'train_runtime': '2156', 'train_tokens_per_second': '1976'} +{'loss': '1.824', 'grad_norm': '3.159', 'learning_rate': '2.621e-05', 'epoch': '0.05245', 'num_input_tokens_seen': 4263901, 'train_runtime': '2157', 'train_tokens_per_second': '1976'} +{'loss': '0.6182', 'grad_norm': '2.311', 'learning_rate': '2.622e-05', 'epoch': '0.05247', 'num_input_tokens_seen': 4265948, 'train_runtime': '2158', 'train_tokens_per_second': '1976'} +{'loss': '1.761', 'grad_norm': '3.925', 'learning_rate': '2.623e-05', 'epoch': '0.0525', 'num_input_tokens_seen': 4267995, 'train_runtime': '2159', 'train_tokens_per_second': '1976'} +{'loss': '0.4528', 'grad_norm': '1.845', 'learning_rate': '2.625e-05', 'epoch': '0.05252', 'num_input_tokens_seen': 4270042, 'train_runtime': '2160', 'train_tokens_per_second': '1976'} +{'loss': '1.88', 'grad_norm': '3.25', 'learning_rate': '2.626e-05', 'epoch': '0.05255', 'num_input_tokens_seen': 4272089, 'train_runtime': '2161', 'train_tokens_per_second': '1976'} +{'loss': '1.381', 'grad_norm': '3.251', 'learning_rate': '2.627e-05', 'epoch': '0.05257', 'num_input_tokens_seen': 4274136, 'train_runtime': '2163', 'train_tokens_per_second': '1976'} +{'loss': '0.9159', 'grad_norm': '2.39', 'learning_rate': '2.628e-05', 'epoch': '0.0526', 'num_input_tokens_seen': 4276183, 'train_runtime': '2164', 'train_tokens_per_second': '1976'} +{'loss': '0.9172', 'grad_norm': '2.372', 'learning_rate': '2.63e-05', 'epoch': '0.05262', 'num_input_tokens_seen': 4278230, 'train_runtime': '2165', 'train_tokens_per_second': '1976'} +{'loss': '0.6852', 'grad_norm': '2.486', 'learning_rate': '2.631e-05', 'epoch': '0.05265', 'num_input_tokens_seen': 4280277, 'train_runtime': '2166', 'train_tokens_per_second': '1976'} +{'loss': '0.6941', 'grad_norm': '2.206', 'learning_rate': '2.632e-05', 'epoch': '0.05267', 'num_input_tokens_seen': 4282324, 'train_runtime': '2167', 'train_tokens_per_second': '1976'} +{'loss': '0.4537', 'grad_norm': '1.516', 'learning_rate': '2.633e-05', 'epoch': '0.0527', 'num_input_tokens_seen': 4284371, 'train_runtime': '2168', 'train_tokens_per_second': '1976'} +{'loss': '0.3651', 'grad_norm': '1.603', 'learning_rate': '2.635e-05', 'epoch': '0.05272', 'num_input_tokens_seen': 4286418, 'train_runtime': '2169', 'train_tokens_per_second': '1976'} +{'loss': '0.9912', 'grad_norm': '2.418', 'learning_rate': '2.636e-05', 'epoch': '0.05275', 'num_input_tokens_seen': 4288465, 'train_runtime': '2170', 'train_tokens_per_second': '1976'} +{'loss': '0.3787', 'grad_norm': '1.66', 'learning_rate': '2.637e-05', 'epoch': '0.05277', 'num_input_tokens_seen': 4290512, 'train_runtime': '2171', 'train_tokens_per_second': '1976'} +{'loss': '1.315', 'grad_norm': '2.648', 'learning_rate': '2.638e-05', 'epoch': '0.0528', 'num_input_tokens_seen': 4292559, 'train_runtime': '2172', 'train_tokens_per_second': '1976'} +{'loss': '0.8228', 'grad_norm': '2.096', 'learning_rate': '2.64e-05', 'epoch': '0.05282', 'num_input_tokens_seen': 4294606, 'train_runtime': '2173', 'train_tokens_per_second': '1976'} +{'loss': '0.4677', 'grad_norm': '2.058', 'learning_rate': '2.641e-05', 'epoch': '0.05285', 'num_input_tokens_seen': 4296653, 'train_runtime': '2174', 'train_tokens_per_second': '1976'} +{'loss': '1.14', 'grad_norm': '2.318', 'learning_rate': '2.642e-05', 'epoch': '0.05287', 'num_input_tokens_seen': 4298700, 'train_runtime': '2175', 'train_tokens_per_second': '1976'} +{'loss': '0.812', 'grad_norm': '2.39', 'learning_rate': '2.644e-05', 'epoch': '0.0529', 'num_input_tokens_seen': 4300747, 'train_runtime': '2176', 'train_tokens_per_second': '1976'} +{'loss': '0.9975', 'grad_norm': '2.111', 'learning_rate': '2.645e-05', 'epoch': '0.05292', 'num_input_tokens_seen': 4302794, 'train_runtime': '2177', 'train_tokens_per_second': '1976'} +{'loss': '0.6706', 'grad_norm': '1.887', 'learning_rate': '2.646e-05', 'epoch': '0.05295', 'num_input_tokens_seen': 4304841, 'train_runtime': '2178', 'train_tokens_per_second': '1976'} +{'loss': '0.8885', 'grad_norm': '2.128', 'learning_rate': '2.647e-05', 'epoch': '0.05297', 'num_input_tokens_seen': 4306888, 'train_runtime': '2179', 'train_tokens_per_second': '1976'} +{'loss': '0.9473', 'grad_norm': '2.216', 'learning_rate': '2.649e-05', 'epoch': '0.053', 'num_input_tokens_seen': 4308935, 'train_runtime': '2180', 'train_tokens_per_second': '1976'} +{'loss': '0.3784', 'grad_norm': '1.807', 'learning_rate': '2.65e-05', 'epoch': '0.05303', 'num_input_tokens_seen': 4310982, 'train_runtime': '2181', 'train_tokens_per_second': '1976'} +{'loss': '0.4437', 'grad_norm': '1.702', 'learning_rate': '2.651e-05', 'epoch': '0.05305', 'num_input_tokens_seen': 4313029, 'train_runtime': '2182', 'train_tokens_per_second': '1976'} +{'loss': '0.5711', 'grad_norm': '2.051', 'learning_rate': '2.652e-05', 'epoch': '0.05308', 'num_input_tokens_seen': 4315076, 'train_runtime': '2183', 'train_tokens_per_second': '1976'} +{'loss': '0.6141', 'grad_norm': '2.251', 'learning_rate': '2.654e-05', 'epoch': '0.0531', 'num_input_tokens_seen': 4317123, 'train_runtime': '2184', 'train_tokens_per_second': '1976'} +{'loss': '0.7515', 'grad_norm': '2.702', 'learning_rate': '2.655e-05', 'epoch': '0.05313', 'num_input_tokens_seen': 4319170, 'train_runtime': '2185', 'train_tokens_per_second': '1976'} +{'loss': '1.018', 'grad_norm': '2.965', 'learning_rate': '2.656e-05', 'epoch': '0.05315', 'num_input_tokens_seen': 4321217, 'train_runtime': '2186', 'train_tokens_per_second': '1976'} +{'loss': '0.6554', 'grad_norm': '2.561', 'learning_rate': '2.657e-05', 'epoch': '0.05318', 'num_input_tokens_seen': 4323264, 'train_runtime': '2187', 'train_tokens_per_second': '1976'} +{'loss': '0.9883', 'grad_norm': '3.308', 'learning_rate': '2.659e-05', 'epoch': '0.0532', 'num_input_tokens_seen': 4325311, 'train_runtime': '2188', 'train_tokens_per_second': '1976'} +{'loss': '0.908', 'grad_norm': '2.287', 'learning_rate': '2.66e-05', 'epoch': '0.05323', 'num_input_tokens_seen': 4327358, 'train_runtime': '2189', 'train_tokens_per_second': '1976'} +{'loss': '2.507', 'grad_norm': '3.331', 'learning_rate': '2.661e-05', 'epoch': '0.05325', 'num_input_tokens_seen': 4329405, 'train_runtime': '2190', 'train_tokens_per_second': '1976'} +{'loss': '2.712', 'grad_norm': '3.741', 'learning_rate': '2.662e-05', 'epoch': '0.05328', 'num_input_tokens_seen': 4331452, 'train_runtime': '2191', 'train_tokens_per_second': '1976'} +{'loss': '2.437', 'grad_norm': '3.127', 'learning_rate': '2.664e-05', 'epoch': '0.0533', 'num_input_tokens_seen': 4333499, 'train_runtime': '2193', 'train_tokens_per_second': '1976'} +{'loss': '0.7674', 'grad_norm': '1.874', 'learning_rate': '2.665e-05', 'epoch': '0.05333', 'num_input_tokens_seen': 4335546, 'train_runtime': '2194', 'train_tokens_per_second': '1976'} +{'loss': '0.6973', 'grad_norm': '1.717', 'learning_rate': '2.666e-05', 'epoch': '0.05335', 'num_input_tokens_seen': 4337593, 'train_runtime': '2195', 'train_tokens_per_second': '1976'} +{'loss': '1.063', 'grad_norm': '2.242', 'learning_rate': '2.667e-05', 'epoch': '0.05338', 'num_input_tokens_seen': 4339640, 'train_runtime': '2196', 'train_tokens_per_second': '1976'} +{'loss': '2.432', 'grad_norm': '3.535', 'learning_rate': '2.669e-05', 'epoch': '0.0534', 'num_input_tokens_seen': 4341687, 'train_runtime': '2197', 'train_tokens_per_second': '1976'} +{'loss': '0.697', 'grad_norm': '2.04', 'learning_rate': '2.67e-05', 'epoch': '0.05343', 'num_input_tokens_seen': 4343734, 'train_runtime': '2198', 'train_tokens_per_second': '1976'} +{'loss': '1.282', 'grad_norm': '2.844', 'learning_rate': '2.671e-05', 'epoch': '0.05345', 'num_input_tokens_seen': 4345781, 'train_runtime': '2199', 'train_tokens_per_second': '1977'} +{'loss': '0.3879', 'grad_norm': '1.559', 'learning_rate': '2.672e-05', 'epoch': '0.05348', 'num_input_tokens_seen': 4347828, 'train_runtime': '2200', 'train_tokens_per_second': '1977'} +{'loss': '0.4964', 'grad_norm': '1.835', 'learning_rate': '2.674e-05', 'epoch': '0.0535', 'num_input_tokens_seen': 4349875, 'train_runtime': '2201', 'train_tokens_per_second': '1977'} +{'loss': '1.533', 'grad_norm': '2.822', 'learning_rate': '2.675e-05', 'epoch': '0.05353', 'num_input_tokens_seen': 4351922, 'train_runtime': '2202', 'train_tokens_per_second': '1977'} +{'loss': '0.6812', 'grad_norm': '1.743', 'learning_rate': '2.676e-05', 'epoch': '0.05355', 'num_input_tokens_seen': 4353969, 'train_runtime': '2203', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '2.538', 'learning_rate': '2.677e-05', 'epoch': '0.05358', 'num_input_tokens_seen': 4356016, 'train_runtime': '2204', 'train_tokens_per_second': '1977'} +{'loss': '1.352', 'grad_norm': '2.665', 'learning_rate': '2.679e-05', 'epoch': '0.0536', 'num_input_tokens_seen': 4358063, 'train_runtime': '2205', 'train_tokens_per_second': '1977'} +{'loss': '0.6385', 'grad_norm': '2.518', 'learning_rate': '2.68e-05', 'epoch': '0.05363', 'num_input_tokens_seen': 4360110, 'train_runtime': '2206', 'train_tokens_per_second': '1977'} +{'loss': '0.7358', 'grad_norm': '2.144', 'learning_rate': '2.681e-05', 'epoch': '0.05365', 'num_input_tokens_seen': 4362157, 'train_runtime': '2207', 'train_tokens_per_second': '1977'} +{'loss': '2.086', 'grad_norm': '3.25', 'learning_rate': '2.683e-05', 'epoch': '0.05368', 'num_input_tokens_seen': 4364204, 'train_runtime': '2208', 'train_tokens_per_second': '1977'} +{'loss': '0.3976', 'grad_norm': '1.649', 'learning_rate': '2.684e-05', 'epoch': '0.0537', 'num_input_tokens_seen': 4366251, 'train_runtime': '2209', 'train_tokens_per_second': '1977'} +{'loss': '0.9944', 'grad_norm': '2.779', 'learning_rate': '2.685e-05', 'epoch': '0.05373', 'num_input_tokens_seen': 4368298, 'train_runtime': '2210', 'train_tokens_per_second': '1977'} +{'loss': '0.5877', 'grad_norm': '2.165', 'learning_rate': '2.686e-05', 'epoch': '0.05376', 'num_input_tokens_seen': 4370345, 'train_runtime': '2211', 'train_tokens_per_second': '1977'} +{'loss': '1.995', 'grad_norm': '4.286', 'learning_rate': '2.688e-05', 'epoch': '0.05378', 'num_input_tokens_seen': 4372392, 'train_runtime': '2212', 'train_tokens_per_second': '1977'} +{'loss': '0.6868', 'grad_norm': '2.462', 'learning_rate': '2.689e-05', 'epoch': '0.05381', 'num_input_tokens_seen': 4374439, 'train_runtime': '2213', 'train_tokens_per_second': '1977'} +{'loss': '0.5245', 'grad_norm': '2.026', 'learning_rate': '2.69e-05', 'epoch': '0.05383', 'num_input_tokens_seen': 4376486, 'train_runtime': '2214', 'train_tokens_per_second': '1977'} +{'loss': '0.9037', 'grad_norm': '2.564', 'learning_rate': '2.691e-05', 'epoch': '0.05386', 'num_input_tokens_seen': 4378533, 'train_runtime': '2215', 'train_tokens_per_second': '1977'} +{'loss': '1.659', 'grad_norm': '3.486', 'learning_rate': '2.693e-05', 'epoch': '0.05388', 'num_input_tokens_seen': 4380580, 'train_runtime': '2216', 'train_tokens_per_second': '1977'} +{'loss': '0.937', 'grad_norm': '3.484', 'learning_rate': '2.694e-05', 'epoch': '0.05391', 'num_input_tokens_seen': 4382627, 'train_runtime': '2217', 'train_tokens_per_second': '1977'} +{'loss': '1.249', 'grad_norm': '2.991', 'learning_rate': '2.695e-05', 'epoch': '0.05393', 'num_input_tokens_seen': 4384674, 'train_runtime': '2218', 'train_tokens_per_second': '1977'} +{'loss': '0.9942', 'grad_norm': '2.302', 'learning_rate': '2.696e-05', 'epoch': '0.05396', 'num_input_tokens_seen': 4386721, 'train_runtime': '2219', 'train_tokens_per_second': '1977'} +{'loss': '1.891', 'grad_norm': '3.312', 'learning_rate': '2.698e-05', 'epoch': '0.05398', 'num_input_tokens_seen': 4388768, 'train_runtime': '2220', 'train_tokens_per_second': '1977'} +{'loss': '1.023', 'grad_norm': '2.228', 'learning_rate': '2.699e-05', 'epoch': '0.05401', 'num_input_tokens_seen': 4390815, 'train_runtime': '2221', 'train_tokens_per_second': '1977'} +{'loss': '2.081', 'grad_norm': '3.19', 'learning_rate': '2.7e-05', 'epoch': '0.05403', 'num_input_tokens_seen': 4392862, 'train_runtime': '2223', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '2.555', 'learning_rate': '2.701e-05', 'epoch': '0.05406', 'num_input_tokens_seen': 4394909, 'train_runtime': '2224', 'train_tokens_per_second': '1977'} +{'loss': '0.4165', 'grad_norm': '1.425', 'learning_rate': '2.703e-05', 'epoch': '0.05408', 'num_input_tokens_seen': 4396956, 'train_runtime': '2225', 'train_tokens_per_second': '1977'} +{'loss': '2.153', 'grad_norm': '3.394', 'learning_rate': '2.704e-05', 'epoch': '0.05411', 'num_input_tokens_seen': 4399003, 'train_runtime': '2226', 'train_tokens_per_second': '1977'} +{'loss': '1.402', 'grad_norm': '3.017', 'learning_rate': '2.705e-05', 'epoch': '0.05413', 'num_input_tokens_seen': 4401050, 'train_runtime': '2227', 'train_tokens_per_second': '1977'} +{'loss': '0.9538', 'grad_norm': '3.03', 'learning_rate': '2.706e-05', 'epoch': '0.05416', 'num_input_tokens_seen': 4403097, 'train_runtime': '2228', 'train_tokens_per_second': '1977'} +{'loss': '0.5092', 'grad_norm': '2.805', 'learning_rate': '2.708e-05', 'epoch': '0.05418', 'num_input_tokens_seen': 4405144, 'train_runtime': '2229', 'train_tokens_per_second': '1977'} +{'loss': '2.111', 'grad_norm': '3.191', 'learning_rate': '2.709e-05', 'epoch': '0.05421', 'num_input_tokens_seen': 4407191, 'train_runtime': '2230', 'train_tokens_per_second': '1977'} +{'loss': '0.559', 'grad_norm': '2.181', 'learning_rate': '2.71e-05', 'epoch': '0.05423', 'num_input_tokens_seen': 4409238, 'train_runtime': '2231', 'train_tokens_per_second': '1977'} +{'loss': '1.396', 'grad_norm': '2.995', 'learning_rate': '2.711e-05', 'epoch': '0.05426', 'num_input_tokens_seen': 4411285, 'train_runtime': '2232', 'train_tokens_per_second': '1977'} +{'loss': '0.9049', 'grad_norm': '2.364', 'learning_rate': '2.713e-05', 'epoch': '0.05428', 'num_input_tokens_seen': 4413332, 'train_runtime': '2233', 'train_tokens_per_second': '1977'} +{'loss': '0.8542', 'grad_norm': '2.74', 'learning_rate': '2.714e-05', 'epoch': '0.05431', 'num_input_tokens_seen': 4415379, 'train_runtime': '2234', 'train_tokens_per_second': '1977'} +{'loss': '2.798', 'grad_norm': '4.224', 'learning_rate': '2.715e-05', 'epoch': '0.05433', 'num_input_tokens_seen': 4417426, 'train_runtime': '2235', 'train_tokens_per_second': '1977'} +{'loss': '0.8381', 'grad_norm': '3.193', 'learning_rate': '2.717e-05', 'epoch': '0.05436', 'num_input_tokens_seen': 4419473, 'train_runtime': '2236', 'train_tokens_per_second': '1977'} +{'loss': '2.372', 'grad_norm': '3.629', 'learning_rate': '2.718e-05', 'epoch': '0.05438', 'num_input_tokens_seen': 4421520, 'train_runtime': '2237', 'train_tokens_per_second': '1977'} +{'loss': '2.088', 'grad_norm': '3.47', 'learning_rate': '2.719e-05', 'epoch': '0.05441', 'num_input_tokens_seen': 4423567, 'train_runtime': '2238', 'train_tokens_per_second': '1977'} +{'loss': '1.612', 'grad_norm': '3.016', 'learning_rate': '2.72e-05', 'epoch': '0.05444', 'num_input_tokens_seen': 4425614, 'train_runtime': '2239', 'train_tokens_per_second': '1977'} +{'loss': '1.777', 'grad_norm': '3.534', 'learning_rate': '2.722e-05', 'epoch': '0.05446', 'num_input_tokens_seen': 4427661, 'train_runtime': '2240', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '2.544', 'learning_rate': '2.723e-05', 'epoch': '0.05449', 'num_input_tokens_seen': 4429708, 'train_runtime': '2241', 'train_tokens_per_second': '1977'} +{'loss': '0.5249', 'grad_norm': '2.372', 'learning_rate': '2.724e-05', 'epoch': '0.05451', 'num_input_tokens_seen': 4431755, 'train_runtime': '2242', 'train_tokens_per_second': '1977'} +{'loss': '0.8638', 'grad_norm': '2.253', 'learning_rate': '2.725e-05', 'epoch': '0.05454', 'num_input_tokens_seen': 4433802, 'train_runtime': '2243', 'train_tokens_per_second': '1977'} +{'loss': '0.4928', 'grad_norm': '1.377', 'learning_rate': '2.727e-05', 'epoch': '0.05456', 'num_input_tokens_seen': 4435849, 'train_runtime': '2244', 'train_tokens_per_second': '1977'} +{'loss': '0.9111', 'grad_norm': '2.266', 'learning_rate': '2.728e-05', 'epoch': '0.05459', 'num_input_tokens_seen': 4437896, 'train_runtime': '2245', 'train_tokens_per_second': '1977'} +{'loss': '1.759', 'grad_norm': '3.461', 'learning_rate': '2.729e-05', 'epoch': '0.05461', 'num_input_tokens_seen': 4439943, 'train_runtime': '2246', 'train_tokens_per_second': '1977'} +{'loss': '1.49', 'grad_norm': '2.899', 'learning_rate': '2.73e-05', 'epoch': '0.05464', 'num_input_tokens_seen': 4441990, 'train_runtime': '2247', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.877', 'learning_rate': '2.732e-05', 'epoch': '0.05466', 'num_input_tokens_seen': 4444037, 'train_runtime': '2248', 'train_tokens_per_second': '1977'} +{'loss': '0.4647', 'grad_norm': '1.883', 'learning_rate': '2.733e-05', 'epoch': '0.05469', 'num_input_tokens_seen': 4446084, 'train_runtime': '2249', 'train_tokens_per_second': '1977'} +{'loss': '0.7551', 'grad_norm': '2.252', 'learning_rate': '2.734e-05', 'epoch': '0.05471', 'num_input_tokens_seen': 4448131, 'train_runtime': '2250', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '2.353', 'learning_rate': '2.735e-05', 'epoch': '0.05474', 'num_input_tokens_seen': 4450178, 'train_runtime': '2251', 'train_tokens_per_second': '1977'} +{'loss': '0.8323', 'grad_norm': '2.183', 'learning_rate': '2.737e-05', 'epoch': '0.05476', 'num_input_tokens_seen': 4452225, 'train_runtime': '2253', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '2.432', 'learning_rate': '2.738e-05', 'epoch': '0.05479', 'num_input_tokens_seen': 4454272, 'train_runtime': '2254', 'train_tokens_per_second': '1977'} +{'loss': '1.565', 'grad_norm': '2.761', 'learning_rate': '2.739e-05', 'epoch': '0.05481', 'num_input_tokens_seen': 4456319, 'train_runtime': '2255', 'train_tokens_per_second': '1977'} +{'loss': '0.534', 'grad_norm': '1.854', 'learning_rate': '2.74e-05', 'epoch': '0.05484', 'num_input_tokens_seen': 4458366, 'train_runtime': '2256', 'train_tokens_per_second': '1977'} +{'loss': '0.7724', 'grad_norm': '2.21', 'learning_rate': '2.742e-05', 'epoch': '0.05486', 'num_input_tokens_seen': 4460413, 'train_runtime': '2257', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '3.373', 'learning_rate': '2.743e-05', 'epoch': '0.05489', 'num_input_tokens_seen': 4462460, 'train_runtime': '2258', 'train_tokens_per_second': '1977'} +{'loss': '0.8286', 'grad_norm': '1.882', 'learning_rate': '2.744e-05', 'epoch': '0.05491', 'num_input_tokens_seen': 4464507, 'train_runtime': '2259', 'train_tokens_per_second': '1977'} +{'loss': '0.9116', 'grad_norm': '2.043', 'learning_rate': '2.745e-05', 'epoch': '0.05494', 'num_input_tokens_seen': 4466554, 'train_runtime': '2260', 'train_tokens_per_second': '1977'} +{'loss': '0.8495', 'grad_norm': '2.142', 'learning_rate': '2.747e-05', 'epoch': '0.05496', 'num_input_tokens_seen': 4468601, 'train_runtime': '2261', 'train_tokens_per_second': '1977'} +{'loss': '0.4376', 'grad_norm': '1.629', 'learning_rate': '2.748e-05', 'epoch': '0.05499', 'num_input_tokens_seen': 4470648, 'train_runtime': '2262', 'train_tokens_per_second': '1977'} +{'loss': '0.8186', 'grad_norm': '2.358', 'learning_rate': '2.749e-05', 'epoch': '0.05501', 'num_input_tokens_seen': 4472695, 'train_runtime': '2263', 'train_tokens_per_second': '1977'} +{'loss': '1.63', 'grad_norm': '3.287', 'learning_rate': '2.751e-05', 'epoch': '0.05504', 'num_input_tokens_seen': 4474742, 'train_runtime': '2264', 'train_tokens_per_second': '1977'} +{'loss': '1.428', 'grad_norm': '2.632', 'learning_rate': '2.752e-05', 'epoch': '0.05506', 'num_input_tokens_seen': 4476789, 'train_runtime': '2265', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '2.434', 'learning_rate': '2.753e-05', 'epoch': '0.05509', 'num_input_tokens_seen': 4478836, 'train_runtime': '2266', 'train_tokens_per_second': '1977'} +{'loss': '1.375', 'grad_norm': '3.118', 'learning_rate': '2.754e-05', 'epoch': '0.05511', 'num_input_tokens_seen': 4480883, 'train_runtime': '2267', 'train_tokens_per_second': '1977'} +{'loss': '0.6327', 'grad_norm': '2.461', 'learning_rate': '2.756e-05', 'epoch': '0.05514', 'num_input_tokens_seen': 4482930, 'train_runtime': '2268', 'train_tokens_per_second': '1977'} +{'loss': '0.7111', 'grad_norm': '2.122', 'learning_rate': '2.757e-05', 'epoch': '0.05517', 'num_input_tokens_seen': 4484977, 'train_runtime': '2269', 'train_tokens_per_second': '1977'} +{'loss': '0.942', 'grad_norm': '3.154', 'learning_rate': '2.758e-05', 'epoch': '0.05519', 'num_input_tokens_seen': 4487024, 'train_runtime': '2270', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '2.755', 'learning_rate': '2.759e-05', 'epoch': '0.05522', 'num_input_tokens_seen': 4489071, 'train_runtime': '2271', 'train_tokens_per_second': '1977'} +{'loss': '0.5944', 'grad_norm': '2.241', 'learning_rate': '2.761e-05', 'epoch': '0.05524', 'num_input_tokens_seen': 4491118, 'train_runtime': '2272', 'train_tokens_per_second': '1977'} +{'loss': '1.156', 'grad_norm': '2.722', 'learning_rate': '2.762e-05', 'epoch': '0.05527', 'num_input_tokens_seen': 4493165, 'train_runtime': '2273', 'train_tokens_per_second': '1977'} +{'loss': '1.358', 'grad_norm': '3.494', 'learning_rate': '2.763e-05', 'epoch': '0.05529', 'num_input_tokens_seen': 4495212, 'train_runtime': '2274', 'train_tokens_per_second': '1977'} +{'loss': '1.541', 'grad_norm': '2.807', 'learning_rate': '2.764e-05', 'epoch': '0.05532', 'num_input_tokens_seen': 4497259, 'train_runtime': '2275', 'train_tokens_per_second': '1977'} +{'loss': '1.682', 'grad_norm': '2.849', 'learning_rate': '2.766e-05', 'epoch': '0.05534', 'num_input_tokens_seen': 4499306, 'train_runtime': '2276', 'train_tokens_per_second': '1977'} +{'loss': '0.6454', 'grad_norm': '2.063', 'learning_rate': '2.767e-05', 'epoch': '0.05537', 'num_input_tokens_seen': 4501353, 'train_runtime': '2277', 'train_tokens_per_second': '1977'} +{'loss': '1.719', 'grad_norm': '2.901', 'learning_rate': '2.768e-05', 'epoch': '0.05539', 'num_input_tokens_seen': 4503400, 'train_runtime': '2278', 'train_tokens_per_second': '1977'} +{'loss': '1.32', 'grad_norm': '3.266', 'learning_rate': '2.769e-05', 'epoch': '0.05542', 'num_input_tokens_seen': 4505447, 'train_runtime': '2279', 'train_tokens_per_second': '1977'} +{'loss': '1.048', 'grad_norm': '2.533', 'learning_rate': '2.771e-05', 'epoch': '0.05544', 'num_input_tokens_seen': 4507494, 'train_runtime': '2280', 'train_tokens_per_second': '1977'} +{'loss': '0.6893', 'grad_norm': '2.091', 'learning_rate': '2.772e-05', 'epoch': '0.05547', 'num_input_tokens_seen': 4509541, 'train_runtime': '2281', 'train_tokens_per_second': '1977'} +{'loss': '0.5915', 'grad_norm': '1.849', 'learning_rate': '2.773e-05', 'epoch': '0.05549', 'num_input_tokens_seen': 4511588, 'train_runtime': '2282', 'train_tokens_per_second': '1977'} +{'loss': '0.8475', 'grad_norm': '1.968', 'learning_rate': '2.774e-05', 'epoch': '0.05552', 'num_input_tokens_seen': 4513635, 'train_runtime': '2284', 'train_tokens_per_second': '1977'} +{'loss': '1.368', 'grad_norm': '2.983', 'learning_rate': '2.776e-05', 'epoch': '0.05554', 'num_input_tokens_seen': 4515682, 'train_runtime': '2285', 'train_tokens_per_second': '1977'} +{'loss': '0.6376', 'grad_norm': '1.96', 'learning_rate': '2.777e-05', 'epoch': '0.05557', 'num_input_tokens_seen': 4517729, 'train_runtime': '2286', 'train_tokens_per_second': '1977'} +{'loss': '0.9722', 'grad_norm': '2.557', 'learning_rate': '2.778e-05', 'epoch': '0.05559', 'num_input_tokens_seen': 4519776, 'train_runtime': '2287', 'train_tokens_per_second': '1977'} +{'loss': '0.823', 'grad_norm': '2.306', 'learning_rate': '2.779e-05', 'epoch': '0.05562', 'num_input_tokens_seen': 4521823, 'train_runtime': '2288', 'train_tokens_per_second': '1977'} +{'loss': '2.065', 'grad_norm': '3.94', 'learning_rate': '2.781e-05', 'epoch': '0.05564', 'num_input_tokens_seen': 4523870, 'train_runtime': '2289', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '2.819', 'learning_rate': '2.782e-05', 'epoch': '0.05567', 'num_input_tokens_seen': 4525917, 'train_runtime': '2290', 'train_tokens_per_second': '1977'} +{'loss': '0.3981', 'grad_norm': '1.835', 'learning_rate': '2.783e-05', 'epoch': '0.05569', 'num_input_tokens_seen': 4527964, 'train_runtime': '2291', 'train_tokens_per_second': '1977'} +{'loss': '0.6866', 'grad_norm': '1.992', 'learning_rate': '2.784e-05', 'epoch': '0.05572', 'num_input_tokens_seen': 4530011, 'train_runtime': '2292', 'train_tokens_per_second': '1977'} +{'loss': '1.976', 'grad_norm': '3.275', 'learning_rate': '2.786e-05', 'epoch': '0.05574', 'num_input_tokens_seen': 4532058, 'train_runtime': '2293', 'train_tokens_per_second': '1977'} +{'loss': '3.596', 'grad_norm': '4.174', 'learning_rate': '2.787e-05', 'epoch': '0.05577', 'num_input_tokens_seen': 4534105, 'train_runtime': '2294', 'train_tokens_per_second': '1977'} +{'loss': '1.329', 'grad_norm': '2.76', 'learning_rate': '2.788e-05', 'epoch': '0.05579', 'num_input_tokens_seen': 4536152, 'train_runtime': '2295', 'train_tokens_per_second': '1977'} +{'loss': '0.5297', 'grad_norm': '1.766', 'learning_rate': '2.79e-05', 'epoch': '0.05582', 'num_input_tokens_seen': 4538199, 'train_runtime': '2296', 'train_tokens_per_second': '1977'} +{'loss': '1.572', 'grad_norm': '3.021', 'learning_rate': '2.791e-05', 'epoch': '0.05585', 'num_input_tokens_seen': 4540246, 'train_runtime': '2297', 'train_tokens_per_second': '1977'} +{'loss': '0.7464', 'grad_norm': '2.591', 'learning_rate': '2.792e-05', 'epoch': '0.05587', 'num_input_tokens_seen': 4542293, 'train_runtime': '2298', 'train_tokens_per_second': '1977'} +{'loss': '0.781', 'grad_norm': '2.946', 'learning_rate': '2.793e-05', 'epoch': '0.0559', 'num_input_tokens_seen': 4544340, 'train_runtime': '2299', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '2.395', 'learning_rate': '2.795e-05', 'epoch': '0.05592', 'num_input_tokens_seen': 4546387, 'train_runtime': '2300', 'train_tokens_per_second': '1977'} +{'loss': '0.4757', 'grad_norm': '1.572', 'learning_rate': '2.796e-05', 'epoch': '0.05595', 'num_input_tokens_seen': 4548434, 'train_runtime': '2301', 'train_tokens_per_second': '1977'} +{'loss': '0.6053', 'grad_norm': '1.956', 'learning_rate': '2.797e-05', 'epoch': '0.05597', 'num_input_tokens_seen': 4550481, 'train_runtime': '2302', 'train_tokens_per_second': '1977'} +{'loss': '1.572', 'grad_norm': '2.851', 'learning_rate': '2.798e-05', 'epoch': '0.056', 'num_input_tokens_seen': 4552528, 'train_runtime': '2303', 'train_tokens_per_second': '1977'} +{'loss': '0.5264', 'grad_norm': '1.922', 'learning_rate': '2.8e-05', 'epoch': '0.05602', 'num_input_tokens_seen': 4554575, 'train_runtime': '2304', 'train_tokens_per_second': '1977'} +{'loss': '1.145', 'grad_norm': '2.54', 'learning_rate': '2.801e-05', 'epoch': '0.05605', 'num_input_tokens_seen': 4556622, 'train_runtime': '2305', 'train_tokens_per_second': '1977'} +{'loss': '2.489', 'grad_norm': '3.354', 'learning_rate': '2.802e-05', 'epoch': '0.05607', 'num_input_tokens_seen': 4558669, 'train_runtime': '2306', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '3.077', 'learning_rate': '2.803e-05', 'epoch': '0.0561', 'num_input_tokens_seen': 4560716, 'train_runtime': '2307', 'train_tokens_per_second': '1977'} +{'loss': '0.5157', 'grad_norm': '1.673', 'learning_rate': '2.805e-05', 'epoch': '0.05612', 'num_input_tokens_seen': 4562763, 'train_runtime': '2308', 'train_tokens_per_second': '1977'} +{'loss': '0.7209', 'grad_norm': '2.08', 'learning_rate': '2.806e-05', 'epoch': '0.05615', 'num_input_tokens_seen': 4564810, 'train_runtime': '2309', 'train_tokens_per_second': '1977'} +{'loss': '1.28', 'grad_norm': '2.956', 'learning_rate': '2.807e-05', 'epoch': '0.05617', 'num_input_tokens_seen': 4566857, 'train_runtime': '2310', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '6.125', 'learning_rate': '2.808e-05', 'epoch': '0.0562', 'num_input_tokens_seen': 4568904, 'train_runtime': '2311', 'train_tokens_per_second': '1977'} +{'loss': '0.6965', 'grad_norm': '2.003', 'learning_rate': '2.81e-05', 'epoch': '0.05622', 'num_input_tokens_seen': 4570951, 'train_runtime': '2312', 'train_tokens_per_second': '1977'} +{'loss': '0.9772', 'grad_norm': '2.543', 'learning_rate': '2.811e-05', 'epoch': '0.05625', 'num_input_tokens_seen': 4572998, 'train_runtime': '2313', 'train_tokens_per_second': '1977'} +{'loss': '0.4303', 'grad_norm': '1.508', 'learning_rate': '2.812e-05', 'epoch': '0.05627', 'num_input_tokens_seen': 4575045, 'train_runtime': '2315', 'train_tokens_per_second': '1977'} +{'loss': '0.6825', 'grad_norm': '2.281', 'learning_rate': '2.813e-05', 'epoch': '0.0563', 'num_input_tokens_seen': 4577092, 'train_runtime': '2316', 'train_tokens_per_second': '1977'} +{'loss': '1.353', 'grad_norm': '3.494', 'learning_rate': '2.815e-05', 'epoch': '0.05632', 'num_input_tokens_seen': 4579139, 'train_runtime': '2317', 'train_tokens_per_second': '1977'} +{'loss': '0.5551', 'grad_norm': '1.962', 'learning_rate': '2.816e-05', 'epoch': '0.05635', 'num_input_tokens_seen': 4581186, 'train_runtime': '2318', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '2.591', 'learning_rate': '2.817e-05', 'epoch': '0.05637', 'num_input_tokens_seen': 4583233, 'train_runtime': '2319', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '3.064', 'learning_rate': '2.818e-05', 'epoch': '0.0564', 'num_input_tokens_seen': 4585280, 'train_runtime': '2320', 'train_tokens_per_second': '1977'} +{'loss': '1.554', 'grad_norm': '3.17', 'learning_rate': '2.82e-05', 'epoch': '0.05642', 'num_input_tokens_seen': 4587327, 'train_runtime': '2321', 'train_tokens_per_second': '1977'} +{'loss': '1.564', 'grad_norm': '3.329', 'learning_rate': '2.821e-05', 'epoch': '0.05645', 'num_input_tokens_seen': 4589374, 'train_runtime': '2322', 'train_tokens_per_second': '1977'} +{'loss': '1.451', 'grad_norm': '3.818', 'learning_rate': '2.822e-05', 'epoch': '0.05647', 'num_input_tokens_seen': 4591421, 'train_runtime': '2323', 'train_tokens_per_second': '1977'} +{'loss': '0.4343', 'grad_norm': '2.008', 'learning_rate': '2.824e-05', 'epoch': '0.0565', 'num_input_tokens_seen': 4593468, 'train_runtime': '2324', 'train_tokens_per_second': '1977'} +{'loss': '0.4902', 'grad_norm': '1.649', 'learning_rate': '2.825e-05', 'epoch': '0.05652', 'num_input_tokens_seen': 4595515, 'train_runtime': '2325', 'train_tokens_per_second': '1977'} +{'loss': '0.94', 'grad_norm': '2.394', 'learning_rate': '2.826e-05', 'epoch': '0.05655', 'num_input_tokens_seen': 4597562, 'train_runtime': '2326', 'train_tokens_per_second': '1977'} +{'loss': '0.5034', 'grad_norm': '2.27', 'learning_rate': '2.827e-05', 'epoch': '0.05658', 'num_input_tokens_seen': 4599609, 'train_runtime': '2327', 'train_tokens_per_second': '1977'} +{'loss': '0.9286', 'grad_norm': '2.532', 'learning_rate': '2.829e-05', 'epoch': '0.0566', 'num_input_tokens_seen': 4601656, 'train_runtime': '2328', 'train_tokens_per_second': '1977'} +{'loss': '0.7773', 'grad_norm': '2.079', 'learning_rate': '2.83e-05', 'epoch': '0.05663', 'num_input_tokens_seen': 4603703, 'train_runtime': '2329', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '3.278', 'learning_rate': '2.831e-05', 'epoch': '0.05665', 'num_input_tokens_seen': 4605750, 'train_runtime': '2330', 'train_tokens_per_second': '1977'} +{'loss': '0.8704', 'grad_norm': '2.852', 'learning_rate': '2.832e-05', 'epoch': '0.05668', 'num_input_tokens_seen': 4607797, 'train_runtime': '2331', 'train_tokens_per_second': '1977'} +{'loss': '0.7239', 'grad_norm': '1.722', 'learning_rate': '2.834e-05', 'epoch': '0.0567', 'num_input_tokens_seen': 4609844, 'train_runtime': '2332', 'train_tokens_per_second': '1977'} +{'loss': '0.9152', 'grad_norm': '2.709', 'learning_rate': '2.835e-05', 'epoch': '0.05673', 'num_input_tokens_seen': 4611891, 'train_runtime': '2333', 'train_tokens_per_second': '1977'} +{'loss': '1.22', 'grad_norm': '2.855', 'learning_rate': '2.836e-05', 'epoch': '0.05675', 'num_input_tokens_seen': 4613938, 'train_runtime': '2334', 'train_tokens_per_second': '1977'} +{'loss': '0.9895', 'grad_norm': '2.244', 'learning_rate': '2.837e-05', 'epoch': '0.05678', 'num_input_tokens_seen': 4615985, 'train_runtime': '2335', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '2.341', 'learning_rate': '2.839e-05', 'epoch': '0.0568', 'num_input_tokens_seen': 4618032, 'train_runtime': '2336', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '3.386', 'learning_rate': '2.84e-05', 'epoch': '0.05683', 'num_input_tokens_seen': 4620079, 'train_runtime': '2337', 'train_tokens_per_second': '1977'} +{'loss': '0.5035', 'grad_norm': '1.827', 'learning_rate': '2.841e-05', 'epoch': '0.05685', 'num_input_tokens_seen': 4622126, 'train_runtime': '2338', 'train_tokens_per_second': '1977'} +{'loss': '2.163', 'grad_norm': '3.444', 'learning_rate': '2.842e-05', 'epoch': '0.05688', 'num_input_tokens_seen': 4624173, 'train_runtime': '2339', 'train_tokens_per_second': '1977'} +{'loss': '1.031', 'grad_norm': '2.364', 'learning_rate': '2.844e-05', 'epoch': '0.0569', 'num_input_tokens_seen': 4626220, 'train_runtime': '2340', 'train_tokens_per_second': '1977'} +{'loss': '0.4935', 'grad_norm': '2.075', 'learning_rate': '2.845e-05', 'epoch': '0.05693', 'num_input_tokens_seen': 4628267, 'train_runtime': '2341', 'train_tokens_per_second': '1977'} +{'loss': '0.9783', 'grad_norm': '3.421', 'learning_rate': '2.846e-05', 'epoch': '0.05695', 'num_input_tokens_seen': 4630314, 'train_runtime': '2342', 'train_tokens_per_second': '1977'} +{'loss': '1.548', 'grad_norm': '3.032', 'learning_rate': '2.847e-05', 'epoch': '0.05698', 'num_input_tokens_seen': 4632361, 'train_runtime': '2343', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '2.968', 'learning_rate': '2.849e-05', 'epoch': '0.057', 'num_input_tokens_seen': 4634408, 'train_runtime': '2344', 'train_tokens_per_second': '1977'} +{'loss': '1.343', 'grad_norm': '2.792', 'learning_rate': '2.85e-05', 'epoch': '0.05703', 'num_input_tokens_seen': 4636455, 'train_runtime': '2346', 'train_tokens_per_second': '1977'} +{'loss': '0.5815', 'grad_norm': '2.223', 'learning_rate': '2.851e-05', 'epoch': '0.05705', 'num_input_tokens_seen': 4638502, 'train_runtime': '2347', 'train_tokens_per_second': '1977'} +{'loss': '0.8913', 'grad_norm': '2.026', 'learning_rate': '2.852e-05', 'epoch': '0.05708', 'num_input_tokens_seen': 4640549, 'train_runtime': '2348', 'train_tokens_per_second': '1977'} +{'loss': '0.6532', 'grad_norm': '1.907', 'learning_rate': '2.854e-05', 'epoch': '0.0571', 'num_input_tokens_seen': 4642596, 'train_runtime': '2349', 'train_tokens_per_second': '1977'} +{'loss': '0.909', 'grad_norm': '2.43', 'learning_rate': '2.855e-05', 'epoch': '0.05713', 'num_input_tokens_seen': 4644643, 'train_runtime': '2350', 'train_tokens_per_second': '1977'} +{'loss': '0.6908', 'grad_norm': '2.484', 'learning_rate': '2.856e-05', 'epoch': '0.05715', 'num_input_tokens_seen': 4646690, 'train_runtime': '2351', 'train_tokens_per_second': '1977'} +{'loss': '0.969', 'grad_norm': '2.837', 'learning_rate': '2.858e-05', 'epoch': '0.05718', 'num_input_tokens_seen': 4648737, 'train_runtime': '2352', 'train_tokens_per_second': '1977'} +{'loss': '1.889', 'grad_norm': '3.3', 'learning_rate': '2.859e-05', 'epoch': '0.0572', 'num_input_tokens_seen': 4650784, 'train_runtime': '2353', 'train_tokens_per_second': '1977'} +{'loss': '1.231', 'grad_norm': '2.922', 'learning_rate': '2.86e-05', 'epoch': '0.05723', 'num_input_tokens_seen': 4652831, 'train_runtime': '2354', 'train_tokens_per_second': '1977'} +{'loss': '0.9207', 'grad_norm': '2.453', 'learning_rate': '2.861e-05', 'epoch': '0.05726', 'num_input_tokens_seen': 4654878, 'train_runtime': '2355', 'train_tokens_per_second': '1977'} +{'loss': '0.7537', 'grad_norm': '2.172', 'learning_rate': '2.863e-05', 'epoch': '0.05728', 'num_input_tokens_seen': 4656925, 'train_runtime': '2356', 'train_tokens_per_second': '1977'} +{'loss': '0.3091', 'grad_norm': '1.579', 'learning_rate': '2.864e-05', 'epoch': '0.05731', 'num_input_tokens_seen': 4658972, 'train_runtime': '2357', 'train_tokens_per_second': '1977'} +{'loss': '0.6474', 'grad_norm': '2.563', 'learning_rate': '2.865e-05', 'epoch': '0.05733', 'num_input_tokens_seen': 4661019, 'train_runtime': '2358', 'train_tokens_per_second': '1977'} +{'loss': '0.3779', 'grad_norm': '1.782', 'learning_rate': '2.866e-05', 'epoch': '0.05736', 'num_input_tokens_seen': 4663066, 'train_runtime': '2359', 'train_tokens_per_second': '1977'} +{'loss': '0.4202', 'grad_norm': '1.774', 'learning_rate': '2.868e-05', 'epoch': '0.05738', 'num_input_tokens_seen': 4665113, 'train_runtime': '2360', 'train_tokens_per_second': '1977'} +{'loss': '1.221', 'grad_norm': '3.184', 'learning_rate': '2.869e-05', 'epoch': '0.05741', 'num_input_tokens_seen': 4667160, 'train_runtime': '2361', 'train_tokens_per_second': '1977'} +{'loss': '3.139', 'grad_norm': '4.256', 'learning_rate': '2.87e-05', 'epoch': '0.05743', 'num_input_tokens_seen': 4669207, 'train_runtime': '2362', 'train_tokens_per_second': '1977'} +{'loss': '2.036', 'grad_norm': '3.819', 'learning_rate': '2.871e-05', 'epoch': '0.05746', 'num_input_tokens_seen': 4671254, 'train_runtime': '2363', 'train_tokens_per_second': '1977'} +{'loss': '2.757', 'grad_norm': '3.303', 'learning_rate': '2.873e-05', 'epoch': '0.05748', 'num_input_tokens_seen': 4673301, 'train_runtime': '2364', 'train_tokens_per_second': '1977'} +{'loss': '0.7983', 'grad_norm': '2.185', 'learning_rate': '2.874e-05', 'epoch': '0.05751', 'num_input_tokens_seen': 4675348, 'train_runtime': '2365', 'train_tokens_per_second': '1977'} +{'loss': '1.217', 'grad_norm': '3.309', 'learning_rate': '2.875e-05', 'epoch': '0.05753', 'num_input_tokens_seen': 4677395, 'train_runtime': '2366', 'train_tokens_per_second': '1977'} +{'loss': '0.6919', 'grad_norm': '1.873', 'learning_rate': '2.876e-05', 'epoch': '0.05756', 'num_input_tokens_seen': 4679442, 'train_runtime': '2367', 'train_tokens_per_second': '1977'} +{'loss': '1.285', 'grad_norm': '2.607', 'learning_rate': '2.878e-05', 'epoch': '0.05758', 'num_input_tokens_seen': 4681489, 'train_runtime': '2368', 'train_tokens_per_second': '1977'} +{'loss': '1.769', 'grad_norm': '3.509', 'learning_rate': '2.879e-05', 'epoch': '0.05761', 'num_input_tokens_seen': 4683536, 'train_runtime': '2369', 'train_tokens_per_second': '1977'} +{'loss': '2.642', 'grad_norm': '3.237', 'learning_rate': '2.88e-05', 'epoch': '0.05763', 'num_input_tokens_seen': 4685583, 'train_runtime': '2370', 'train_tokens_per_second': '1977'} +{'loss': '0.5927', 'grad_norm': '2.044', 'learning_rate': '2.881e-05', 'epoch': '0.05766', 'num_input_tokens_seen': 4687630, 'train_runtime': '2371', 'train_tokens_per_second': '1977'} +{'loss': '1.32', 'grad_norm': '2.314', 'learning_rate': '2.883e-05', 'epoch': '0.05768', 'num_input_tokens_seen': 4689677, 'train_runtime': '2372', 'train_tokens_per_second': '1977'} +{'loss': '1.021', 'grad_norm': '3.155', 'learning_rate': '2.884e-05', 'epoch': '0.05771', 'num_input_tokens_seen': 4691724, 'train_runtime': '2373', 'train_tokens_per_second': '1977'} +{'loss': '1.475', 'grad_norm': '3.071', 'learning_rate': '2.885e-05', 'epoch': '0.05773', 'num_input_tokens_seen': 4693771, 'train_runtime': '2374', 'train_tokens_per_second': '1977'} +{'loss': '2.013', 'grad_norm': '3.611', 'learning_rate': '2.886e-05', 'epoch': '0.05776', 'num_input_tokens_seen': 4695818, 'train_runtime': '2375', 'train_tokens_per_second': '1977'} +{'loss': '0.48', 'grad_norm': '1.826', 'learning_rate': '2.888e-05', 'epoch': '0.05778', 'num_input_tokens_seen': 4697865, 'train_runtime': '2377', 'train_tokens_per_second': '1977'} +{'loss': '0.4296', 'grad_norm': '1.757', 'learning_rate': '2.889e-05', 'epoch': '0.05781', 'num_input_tokens_seen': 4699912, 'train_runtime': '2378', 'train_tokens_per_second': '1977'} +{'loss': '1.418', 'grad_norm': '3.72', 'learning_rate': '2.89e-05', 'epoch': '0.05783', 'num_input_tokens_seen': 4701959, 'train_runtime': '2379', 'train_tokens_per_second': '1977'} +{'loss': '0.6986', 'grad_norm': '2.079', 'learning_rate': '2.891e-05', 'epoch': '0.05786', 'num_input_tokens_seen': 4704006, 'train_runtime': '2380', 'train_tokens_per_second': '1977'} +{'loss': '0.9325', 'grad_norm': '3.112', 'learning_rate': '2.893e-05', 'epoch': '0.05788', 'num_input_tokens_seen': 4706053, 'train_runtime': '2381', 'train_tokens_per_second': '1977'} +{'loss': '1.26', 'grad_norm': '2.87', 'learning_rate': '2.894e-05', 'epoch': '0.05791', 'num_input_tokens_seen': 4708100, 'train_runtime': '2382', 'train_tokens_per_second': '1977'} +{'loss': '2.196', 'grad_norm': '3.173', 'learning_rate': '2.895e-05', 'epoch': '0.05793', 'num_input_tokens_seen': 4710147, 'train_runtime': '2383', 'train_tokens_per_second': '1977'} +{'loss': '1.262', 'grad_norm': '2.6', 'learning_rate': '2.897e-05', 'epoch': '0.05796', 'num_input_tokens_seen': 4712194, 'train_runtime': '2384', 'train_tokens_per_second': '1977'} +{'loss': '1.434', 'grad_norm': '2.896', 'learning_rate': '2.898e-05', 'epoch': '0.05799', 'num_input_tokens_seen': 4714241, 'train_runtime': '2385', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '2.779', 'learning_rate': '2.899e-05', 'epoch': '0.05801', 'num_input_tokens_seen': 4716288, 'train_runtime': '2386', 'train_tokens_per_second': '1977'} +{'loss': '0.4688', 'grad_norm': '1.929', 'learning_rate': '2.9e-05', 'epoch': '0.05804', 'num_input_tokens_seen': 4718335, 'train_runtime': '2387', 'train_tokens_per_second': '1977'} +{'loss': '0.4102', 'grad_norm': '1.638', 'learning_rate': '2.902e-05', 'epoch': '0.05806', 'num_input_tokens_seen': 4720382, 'train_runtime': '2388', 'train_tokens_per_second': '1977'} +{'loss': '0.6797', 'grad_norm': '2.042', 'learning_rate': '2.903e-05', 'epoch': '0.05809', 'num_input_tokens_seen': 4722429, 'train_runtime': '2389', 'train_tokens_per_second': '1977'} +{'loss': '0.9009', 'grad_norm': '2.256', 'learning_rate': '2.904e-05', 'epoch': '0.05811', 'num_input_tokens_seen': 4724476, 'train_runtime': '2390', 'train_tokens_per_second': '1977'} +{'loss': '1.375', 'grad_norm': '3.072', 'learning_rate': '2.905e-05', 'epoch': '0.05814', 'num_input_tokens_seen': 4726523, 'train_runtime': '2391', 'train_tokens_per_second': '1977'} +{'loss': '1.857', 'grad_norm': '3.309', 'learning_rate': '2.907e-05', 'epoch': '0.05816', 'num_input_tokens_seen': 4728570, 'train_runtime': '2392', 'train_tokens_per_second': '1977'} +{'loss': '1.231', 'grad_norm': '2.732', 'learning_rate': '2.908e-05', 'epoch': '0.05819', 'num_input_tokens_seen': 4730617, 'train_runtime': '2393', 'train_tokens_per_second': '1977'} +{'loss': '0.5224', 'grad_norm': '1.895', 'learning_rate': '2.909e-05', 'epoch': '0.05821', 'num_input_tokens_seen': 4732664, 'train_runtime': '2394', 'train_tokens_per_second': '1977'} +{'loss': '0.7798', 'grad_norm': '2.007', 'learning_rate': '2.91e-05', 'epoch': '0.05824', 'num_input_tokens_seen': 4734711, 'train_runtime': '2395', 'train_tokens_per_second': '1977'} +{'loss': '0.5316', 'grad_norm': '2.043', 'learning_rate': '2.912e-05', 'epoch': '0.05826', 'num_input_tokens_seen': 4736758, 'train_runtime': '2396', 'train_tokens_per_second': '1977'} +{'loss': '0.9858', 'grad_norm': '2.924', 'learning_rate': '2.913e-05', 'epoch': '0.05829', 'num_input_tokens_seen': 4738805, 'train_runtime': '2397', 'train_tokens_per_second': '1977'} +{'loss': '0.6895', 'grad_norm': '3.206', 'learning_rate': '2.914e-05', 'epoch': '0.05831', 'num_input_tokens_seen': 4740852, 'train_runtime': '2398', 'train_tokens_per_second': '1977'} +{'loss': '3.068', 'grad_norm': '3.154', 'learning_rate': '2.915e-05', 'epoch': '0.05834', 'num_input_tokens_seen': 4742899, 'train_runtime': '2399', 'train_tokens_per_second': '1977'} +{'loss': '2.205', 'grad_norm': '3.081', 'learning_rate': '2.917e-05', 'epoch': '0.05836', 'num_input_tokens_seen': 4744946, 'train_runtime': '2400', 'train_tokens_per_second': '1977'} +{'loss': '2.642', 'grad_norm': '3.132', 'learning_rate': '2.918e-05', 'epoch': '0.05839', 'num_input_tokens_seen': 4746993, 'train_runtime': '2401', 'train_tokens_per_second': '1977'} +{'loss': '0.3853', 'grad_norm': '1.743', 'learning_rate': '2.919e-05', 'epoch': '0.05841', 'num_input_tokens_seen': 4749040, 'train_runtime': '2402', 'train_tokens_per_second': '1977'} +{'loss': '0.7914', 'grad_norm': '2.174', 'learning_rate': '2.92e-05', 'epoch': '0.05844', 'num_input_tokens_seen': 4751087, 'train_runtime': '2403', 'train_tokens_per_second': '1977'} +{'loss': '1.419', 'grad_norm': '2.976', 'learning_rate': '2.922e-05', 'epoch': '0.05846', 'num_input_tokens_seen': 4753134, 'train_runtime': '2404', 'train_tokens_per_second': '1977'} +{'loss': '0.8815', 'grad_norm': '1.908', 'learning_rate': '2.923e-05', 'epoch': '0.05849', 'num_input_tokens_seen': 4755181, 'train_runtime': '2405', 'train_tokens_per_second': '1977'} +{'loss': '0.4519', 'grad_norm': '1.649', 'learning_rate': '2.924e-05', 'epoch': '0.05851', 'num_input_tokens_seen': 4757228, 'train_runtime': '2406', 'train_tokens_per_second': '1977'} +{'loss': '0.4125', 'grad_norm': '1.615', 'learning_rate': '2.925e-05', 'epoch': '0.05854', 'num_input_tokens_seen': 4759275, 'train_runtime': '2408', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '2.805', 'learning_rate': '2.927e-05', 'epoch': '0.05856', 'num_input_tokens_seen': 4761322, 'train_runtime': '2409', 'train_tokens_per_second': '1977'} +{'loss': '1.402', 'grad_norm': '2.8', 'learning_rate': '2.928e-05', 'epoch': '0.05859', 'num_input_tokens_seen': 4763369, 'train_runtime': '2410', 'train_tokens_per_second': '1977'} +{'loss': '0.4433', 'grad_norm': '1.768', 'learning_rate': '2.929e-05', 'epoch': '0.05861', 'num_input_tokens_seen': 4765416, 'train_runtime': '2411', 'train_tokens_per_second': '1977'} +{'loss': '3.153', 'grad_norm': '3.732', 'learning_rate': '2.931e-05', 'epoch': '0.05864', 'num_input_tokens_seen': 4767463, 'train_runtime': '2412', 'train_tokens_per_second': '1977'} +{'loss': '0.8159', 'grad_norm': '2.089', 'learning_rate': '2.932e-05', 'epoch': '0.05867', 'num_input_tokens_seen': 4769510, 'train_runtime': '2413', 'train_tokens_per_second': '1977'} +{'loss': '1.508', 'grad_norm': '3.104', 'learning_rate': '2.933e-05', 'epoch': '0.05869', 'num_input_tokens_seen': 4771557, 'train_runtime': '2414', 'train_tokens_per_second': '1977'} +{'loss': '0.8277', 'grad_norm': '2.257', 'learning_rate': '2.934e-05', 'epoch': '0.05872', 'num_input_tokens_seen': 4773604, 'train_runtime': '2415', 'train_tokens_per_second': '1977'} +{'loss': '0.86', 'grad_norm': '2.539', 'learning_rate': '2.936e-05', 'epoch': '0.05874', 'num_input_tokens_seen': 4775651, 'train_runtime': '2416', 'train_tokens_per_second': '1977'} +{'loss': '0.8038', 'grad_norm': '2.479', 'learning_rate': '2.937e-05', 'epoch': '0.05877', 'num_input_tokens_seen': 4777698, 'train_runtime': '2417', 'train_tokens_per_second': '1977'} +{'loss': '0.9994', 'grad_norm': '2.426', 'learning_rate': '2.938e-05', 'epoch': '0.05879', 'num_input_tokens_seen': 4779745, 'train_runtime': '2418', 'train_tokens_per_second': '1977'} +{'loss': '0.6132', 'grad_norm': '2.238', 'learning_rate': '2.939e-05', 'epoch': '0.05882', 'num_input_tokens_seen': 4781792, 'train_runtime': '2419', 'train_tokens_per_second': '1977'} +{'loss': '1.619', 'grad_norm': '2.965', 'learning_rate': '2.941e-05', 'epoch': '0.05884', 'num_input_tokens_seen': 4783839, 'train_runtime': '2420', 'train_tokens_per_second': '1977'} +{'loss': '0.4158', 'grad_norm': '1.858', 'learning_rate': '2.942e-05', 'epoch': '0.05887', 'num_input_tokens_seen': 4785886, 'train_runtime': '2421', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '2.477', 'learning_rate': '2.943e-05', 'epoch': '0.05889', 'num_input_tokens_seen': 4787933, 'train_runtime': '2422', 'train_tokens_per_second': '1977'} +{'loss': '1.105', 'grad_norm': '2.279', 'learning_rate': '2.944e-05', 'epoch': '0.05892', 'num_input_tokens_seen': 4789980, 'train_runtime': '2423', 'train_tokens_per_second': '1977'} +{'loss': '1.111', 'grad_norm': '3.088', 'learning_rate': '2.946e-05', 'epoch': '0.05894', 'num_input_tokens_seen': 4792027, 'train_runtime': '2424', 'train_tokens_per_second': '1977'} +{'loss': '1.033', 'grad_norm': '2.833', 'learning_rate': '2.947e-05', 'epoch': '0.05897', 'num_input_tokens_seen': 4794074, 'train_runtime': '2425', 'train_tokens_per_second': '1977'} +{'loss': '0.4009', 'grad_norm': '1.638', 'learning_rate': '2.948e-05', 'epoch': '0.05899', 'num_input_tokens_seen': 4796121, 'train_runtime': '2426', 'train_tokens_per_second': '1977'} +{'loss': '0.5431', 'grad_norm': '2.15', 'learning_rate': '2.949e-05', 'epoch': '0.05902', 'num_input_tokens_seen': 4798168, 'train_runtime': '2427', 'train_tokens_per_second': '1977'} +{'loss': '1.92', 'grad_norm': '4.304', 'learning_rate': '2.951e-05', 'epoch': '0.05904', 'num_input_tokens_seen': 4800215, 'train_runtime': '2428', 'train_tokens_per_second': '1977'} +{'loss': '0.8866', 'grad_norm': '2.739', 'learning_rate': '2.952e-05', 'epoch': '0.05907', 'num_input_tokens_seen': 4802262, 'train_runtime': '2429', 'train_tokens_per_second': '1977'} +{'loss': '1.201', 'grad_norm': '2.853', 'learning_rate': '2.953e-05', 'epoch': '0.05909', 'num_input_tokens_seen': 4804309, 'train_runtime': '2430', 'train_tokens_per_second': '1977'} +{'loss': '2.379', 'grad_norm': '2.971', 'learning_rate': '2.954e-05', 'epoch': '0.05912', 'num_input_tokens_seen': 4806356, 'train_runtime': '2431', 'train_tokens_per_second': '1977'} +{'loss': '0.4771', 'grad_norm': '1.503', 'learning_rate': '2.956e-05', 'epoch': '0.05914', 'num_input_tokens_seen': 4808403, 'train_runtime': '2432', 'train_tokens_per_second': '1977'} +{'loss': '1.076', 'grad_norm': '2.358', 'learning_rate': '2.957e-05', 'epoch': '0.05917', 'num_input_tokens_seen': 4810450, 'train_runtime': '2433', 'train_tokens_per_second': '1977'} +{'loss': '0.5102', 'grad_norm': '1.662', 'learning_rate': '2.958e-05', 'epoch': '0.05919', 'num_input_tokens_seen': 4812497, 'train_runtime': '2434', 'train_tokens_per_second': '1977'} +{'loss': '0.792', 'grad_norm': '2.305', 'learning_rate': '2.959e-05', 'epoch': '0.05922', 'num_input_tokens_seen': 4814544, 'train_runtime': '2435', 'train_tokens_per_second': '1977'} +{'loss': '0.8357', 'grad_norm': '2.42', 'learning_rate': '2.961e-05', 'epoch': '0.05924', 'num_input_tokens_seen': 4816591, 'train_runtime': '2436', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '2.929', 'learning_rate': '2.962e-05', 'epoch': '0.05927', 'num_input_tokens_seen': 4818638, 'train_runtime': '2437', 'train_tokens_per_second': '1977'} +{'loss': '2.313', 'grad_norm': '3.878', 'learning_rate': '2.963e-05', 'epoch': '0.05929', 'num_input_tokens_seen': 4820685, 'train_runtime': '2439', 'train_tokens_per_second': '1977'} +{'loss': '1.391', 'grad_norm': '2.715', 'learning_rate': '2.965e-05', 'epoch': '0.05932', 'num_input_tokens_seen': 4822732, 'train_runtime': '2440', 'train_tokens_per_second': '1977'} +{'loss': '2.445', 'grad_norm': '3.12', 'learning_rate': '2.966e-05', 'epoch': '0.05934', 'num_input_tokens_seen': 4824779, 'train_runtime': '2441', 'train_tokens_per_second': '1977'} +{'loss': '0.7738', 'grad_norm': '2.24', 'learning_rate': '2.967e-05', 'epoch': '0.05937', 'num_input_tokens_seen': 4826826, 'train_runtime': '2442', 'train_tokens_per_second': '1977'} +{'loss': '0.4707', 'grad_norm': '1.706', 'learning_rate': '2.968e-05', 'epoch': '0.0594', 'num_input_tokens_seen': 4828873, 'train_runtime': '2443', 'train_tokens_per_second': '1977'} +{'loss': '0.9572', 'grad_norm': '2.416', 'learning_rate': '2.97e-05', 'epoch': '0.05942', 'num_input_tokens_seen': 4830920, 'train_runtime': '2444', 'train_tokens_per_second': '1977'} +{'loss': '0.8986', 'grad_norm': '2.201', 'learning_rate': '2.971e-05', 'epoch': '0.05945', 'num_input_tokens_seen': 4832967, 'train_runtime': '2445', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '2.871', 'learning_rate': '2.972e-05', 'epoch': '0.05947', 'num_input_tokens_seen': 4835014, 'train_runtime': '2446', 'train_tokens_per_second': '1977'} +{'loss': '0.8814', 'grad_norm': '2.559', 'learning_rate': '2.973e-05', 'epoch': '0.0595', 'num_input_tokens_seen': 4837061, 'train_runtime': '2447', 'train_tokens_per_second': '1977'} +{'loss': '1.161', 'grad_norm': '2.955', 'learning_rate': '2.975e-05', 'epoch': '0.05952', 'num_input_tokens_seen': 4839108, 'train_runtime': '2448', 'train_tokens_per_second': '1977'} +{'loss': '0.6506', 'grad_norm': '1.801', 'learning_rate': '2.976e-05', 'epoch': '0.05955', 'num_input_tokens_seen': 4841155, 'train_runtime': '2449', 'train_tokens_per_second': '1977'} +{'loss': '0.7271', 'grad_norm': '2.075', 'learning_rate': '2.977e-05', 'epoch': '0.05957', 'num_input_tokens_seen': 4843202, 'train_runtime': '2450', 'train_tokens_per_second': '1977'} +{'loss': '0.4921', 'grad_norm': '2.17', 'learning_rate': '2.978e-05', 'epoch': '0.0596', 'num_input_tokens_seen': 4845249, 'train_runtime': '2451', 'train_tokens_per_second': '1977'} +{'loss': '2.041', 'grad_norm': '3.511', 'learning_rate': '2.98e-05', 'epoch': '0.05962', 'num_input_tokens_seen': 4847296, 'train_runtime': '2452', 'train_tokens_per_second': '1977'} +{'loss': '0.8911', 'grad_norm': '2.665', 'learning_rate': '2.981e-05', 'epoch': '0.05965', 'num_input_tokens_seen': 4849343, 'train_runtime': '2453', 'train_tokens_per_second': '1977'} +{'loss': '0.446', 'grad_norm': '1.644', 'learning_rate': '2.982e-05', 'epoch': '0.05967', 'num_input_tokens_seen': 4851390, 'train_runtime': '2454', 'train_tokens_per_second': '1977'} +{'loss': '0.4172', 'grad_norm': '2.005', 'learning_rate': '2.983e-05', 'epoch': '0.0597', 'num_input_tokens_seen': 4853437, 'train_runtime': '2455', 'train_tokens_per_second': '1977'} +{'loss': '2.26', 'grad_norm': '3.523', 'learning_rate': '2.985e-05', 'epoch': '0.05972', 'num_input_tokens_seen': 4855484, 'train_runtime': '2456', 'train_tokens_per_second': '1977'} +{'loss': '0.7038', 'grad_norm': '2.02', 'learning_rate': '2.986e-05', 'epoch': '0.05975', 'num_input_tokens_seen': 4857531, 'train_runtime': '2457', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '2.599', 'learning_rate': '2.987e-05', 'epoch': '0.05977', 'num_input_tokens_seen': 4859578, 'train_runtime': '2458', 'train_tokens_per_second': '1977'} +{'loss': '0.6195', 'grad_norm': '2.119', 'learning_rate': '2.988e-05', 'epoch': '0.0598', 'num_input_tokens_seen': 4861625, 'train_runtime': '2459', 'train_tokens_per_second': '1977'} +{'loss': '2.321', 'grad_norm': '3.08', 'learning_rate': '2.99e-05', 'epoch': '0.05982', 'num_input_tokens_seen': 4863672, 'train_runtime': '2460', 'train_tokens_per_second': '1977'} +{'loss': '0.5354', 'grad_norm': '2.187', 'learning_rate': '2.991e-05', 'epoch': '0.05985', 'num_input_tokens_seen': 4865719, 'train_runtime': '2461', 'train_tokens_per_second': '1977'} +{'loss': '0.8993', 'grad_norm': '2.052', 'learning_rate': '2.992e-05', 'epoch': '0.05987', 'num_input_tokens_seen': 4867766, 'train_runtime': '2462', 'train_tokens_per_second': '1977'} +{'loss': '1.693', 'grad_norm': '3.429', 'learning_rate': '2.993e-05', 'epoch': '0.0599', 'num_input_tokens_seen': 4869813, 'train_runtime': '2463', 'train_tokens_per_second': '1977'} +{'loss': '0.4461', 'grad_norm': '1.652', 'learning_rate': '2.995e-05', 'epoch': '0.05992', 'num_input_tokens_seen': 4871860, 'train_runtime': '2464', 'train_tokens_per_second': '1977'} +{'loss': '0.359', 'grad_norm': '1.514', 'learning_rate': '2.996e-05', 'epoch': '0.05995', 'num_input_tokens_seen': 4873907, 'train_runtime': '2465', 'train_tokens_per_second': '1977'} +{'loss': '0.6929', 'grad_norm': '1.992', 'learning_rate': '2.997e-05', 'epoch': '0.05997', 'num_input_tokens_seen': 4875954, 'train_runtime': '2466', 'train_tokens_per_second': '1977'} +{'loss': '0.7811', 'grad_norm': '2.863', 'learning_rate': '2.998e-05', 'epoch': '0.06', 'num_input_tokens_seen': 4878001, 'train_runtime': '2467', 'train_tokens_per_second': '1977'} +{'loss': '1.168', 'grad_norm': '3.424', 'learning_rate': '3e-05', 'epoch': '0.06002', 'num_input_tokens_seen': 4880048, 'train_runtime': '2469', 'train_tokens_per_second': '1977'} +{'loss': '1.761', 'grad_norm': '3.416', 'learning_rate': '3.001e-05', 'epoch': '0.06005', 'num_input_tokens_seen': 4882095, 'train_runtime': '2470', 'train_tokens_per_second': '1977'} +{'loss': '0.8209', 'grad_norm': '2.337', 'learning_rate': '3.002e-05', 'epoch': '0.06008', 'num_input_tokens_seen': 4884142, 'train_runtime': '2471', 'train_tokens_per_second': '1977'} +{'loss': '1.496', 'grad_norm': '2.776', 'learning_rate': '3.004e-05', 'epoch': '0.0601', 'num_input_tokens_seen': 4886189, 'train_runtime': '2472', 'train_tokens_per_second': '1977'} +{'loss': '0.9162', 'grad_norm': '2.157', 'learning_rate': '3.005e-05', 'epoch': '0.06013', 'num_input_tokens_seen': 4888236, 'train_runtime': '2473', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '3.046', 'learning_rate': '3.006e-05', 'epoch': '0.06015', 'num_input_tokens_seen': 4890283, 'train_runtime': '2474', 'train_tokens_per_second': '1977'} +{'loss': '0.805', 'grad_norm': '2.275', 'learning_rate': '3.007e-05', 'epoch': '0.06018', 'num_input_tokens_seen': 4892330, 'train_runtime': '2475', 'train_tokens_per_second': '1977'} +{'loss': '1.701', 'grad_norm': '3.252', 'learning_rate': '3.009e-05', 'epoch': '0.0602', 'num_input_tokens_seen': 4894377, 'train_runtime': '2476', 'train_tokens_per_second': '1977'} +{'loss': '1.496', 'grad_norm': '2.938', 'learning_rate': '3.01e-05', 'epoch': '0.06023', 'num_input_tokens_seen': 4896424, 'train_runtime': '2477', 'train_tokens_per_second': '1977'} +{'loss': '0.936', 'grad_norm': '2.941', 'learning_rate': '3.011e-05', 'epoch': '0.06025', 'num_input_tokens_seen': 4898471, 'train_runtime': '2478', 'train_tokens_per_second': '1977'} +{'loss': '0.6776', 'grad_norm': '2.179', 'learning_rate': '3.012e-05', 'epoch': '0.06028', 'num_input_tokens_seen': 4900518, 'train_runtime': '2479', 'train_tokens_per_second': '1977'} +{'loss': '0.7257', 'grad_norm': '2.002', 'learning_rate': '3.014e-05', 'epoch': '0.0603', 'num_input_tokens_seen': 4902565, 'train_runtime': '2480', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '3.194', 'learning_rate': '3.015e-05', 'epoch': '0.06033', 'num_input_tokens_seen': 4904612, 'train_runtime': '2481', 'train_tokens_per_second': '1977'} +{'loss': '1.374', 'grad_norm': '2.916', 'learning_rate': '3.016e-05', 'epoch': '0.06035', 'num_input_tokens_seen': 4906659, 'train_runtime': '2482', 'train_tokens_per_second': '1977'} +{'loss': '0.4419', 'grad_norm': '1.926', 'learning_rate': '3.017e-05', 'epoch': '0.06038', 'num_input_tokens_seen': 4908706, 'train_runtime': '2483', 'train_tokens_per_second': '1977'} +{'loss': '0.4879', 'grad_norm': '2.014', 'learning_rate': '3.019e-05', 'epoch': '0.0604', 'num_input_tokens_seen': 4910753, 'train_runtime': '2484', 'train_tokens_per_second': '1977'} +{'loss': '0.5319', 'grad_norm': '2.092', 'learning_rate': '3.02e-05', 'epoch': '0.06043', 'num_input_tokens_seen': 4912800, 'train_runtime': '2485', 'train_tokens_per_second': '1977'} +{'loss': '1.009', 'grad_norm': '2.373', 'learning_rate': '3.021e-05', 'epoch': '0.06045', 'num_input_tokens_seen': 4914847, 'train_runtime': '2486', 'train_tokens_per_second': '1977'} +{'loss': '1.903', 'grad_norm': '3.632', 'learning_rate': '3.022e-05', 'epoch': '0.06048', 'num_input_tokens_seen': 4916894, 'train_runtime': '2487', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '2.958', 'learning_rate': '3.024e-05', 'epoch': '0.0605', 'num_input_tokens_seen': 4918941, 'train_runtime': '2488', 'train_tokens_per_second': '1977'} +{'loss': '1.343', 'grad_norm': '3.111', 'learning_rate': '3.025e-05', 'epoch': '0.06053', 'num_input_tokens_seen': 4920988, 'train_runtime': '2489', 'train_tokens_per_second': '1977'} +{'loss': '1.76', 'grad_norm': '3.58', 'learning_rate': '3.026e-05', 'epoch': '0.06055', 'num_input_tokens_seen': 4923035, 'train_runtime': '2490', 'train_tokens_per_second': '1977'} +{'loss': '2.816', 'grad_norm': '3.191', 'learning_rate': '3.027e-05', 'epoch': '0.06058', 'num_input_tokens_seen': 4925082, 'train_runtime': '2491', 'train_tokens_per_second': '1977'} +{'loss': '2.028', 'grad_norm': '3.763', 'learning_rate': '3.029e-05', 'epoch': '0.0606', 'num_input_tokens_seen': 4927129, 'train_runtime': '2492', 'train_tokens_per_second': '1977'} +{'loss': '2.078', 'grad_norm': '3.643', 'learning_rate': '3.03e-05', 'epoch': '0.06063', 'num_input_tokens_seen': 4929176, 'train_runtime': '2493', 'train_tokens_per_second': '1977'} +{'loss': '0.7944', 'grad_norm': '2.068', 'learning_rate': '3.031e-05', 'epoch': '0.06065', 'num_input_tokens_seen': 4931223, 'train_runtime': '2494', 'train_tokens_per_second': '1977'} +{'loss': '0.9707', 'grad_norm': '2.473', 'learning_rate': '3.032e-05', 'epoch': '0.06068', 'num_input_tokens_seen': 4933270, 'train_runtime': '2495', 'train_tokens_per_second': '1977'} +{'loss': '0.4914', 'grad_norm': '1.758', 'learning_rate': '3.034e-05', 'epoch': '0.0607', 'num_input_tokens_seen': 4935317, 'train_runtime': '2496', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '2.694', 'learning_rate': '3.035e-05', 'epoch': '0.06073', 'num_input_tokens_seen': 4937364, 'train_runtime': '2497', 'train_tokens_per_second': '1977'} +{'loss': '0.8479', 'grad_norm': '2.649', 'learning_rate': '3.036e-05', 'epoch': '0.06075', 'num_input_tokens_seen': 4939411, 'train_runtime': '2498', 'train_tokens_per_second': '1977'} +{'loss': '0.8631', 'grad_norm': '2.703', 'learning_rate': '3.038e-05', 'epoch': '0.06078', 'num_input_tokens_seen': 4941458, 'train_runtime': '2500', 'train_tokens_per_second': '1977'} +{'loss': '1.522', 'grad_norm': '2.991', 'learning_rate': '3.039e-05', 'epoch': '0.06081', 'num_input_tokens_seen': 4943505, 'train_runtime': '2501', 'train_tokens_per_second': '1977'} +{'loss': '0.9402', 'grad_norm': '2.276', 'learning_rate': '3.04e-05', 'epoch': '0.06083', 'num_input_tokens_seen': 4945552, 'train_runtime': '2502', 'train_tokens_per_second': '1977'} +{'loss': '1.617', 'grad_norm': '3.227', 'learning_rate': '3.041e-05', 'epoch': '0.06086', 'num_input_tokens_seen': 4947599, 'train_runtime': '2503', 'train_tokens_per_second': '1977'} +{'loss': '0.48', 'grad_norm': '1.441', 'learning_rate': '3.043e-05', 'epoch': '0.06088', 'num_input_tokens_seen': 4949646, 'train_runtime': '2504', 'train_tokens_per_second': '1977'} +{'loss': '1.103', 'grad_norm': '3.253', 'learning_rate': '3.044e-05', 'epoch': '0.06091', 'num_input_tokens_seen': 4951693, 'train_runtime': '2505', 'train_tokens_per_second': '1977'} +{'loss': '0.6059', 'grad_norm': '2.16', 'learning_rate': '3.045e-05', 'epoch': '0.06093', 'num_input_tokens_seen': 4953740, 'train_runtime': '2506', 'train_tokens_per_second': '1977'} +{'loss': '1.528', 'grad_norm': '3.036', 'learning_rate': '3.046e-05', 'epoch': '0.06096', 'num_input_tokens_seen': 4955787, 'train_runtime': '2507', 'train_tokens_per_second': '1977'} +{'loss': '1.796', 'grad_norm': '3.681', 'learning_rate': '3.048e-05', 'epoch': '0.06098', 'num_input_tokens_seen': 4957834, 'train_runtime': '2508', 'train_tokens_per_second': '1977'} +{'loss': '0.6116', 'grad_norm': '2.545', 'learning_rate': '3.049e-05', 'epoch': '0.06101', 'num_input_tokens_seen': 4959881, 'train_runtime': '2509', 'train_tokens_per_second': '1977'} +{'loss': '0.8303', 'grad_norm': '1.99', 'learning_rate': '3.05e-05', 'epoch': '0.06103', 'num_input_tokens_seen': 4961928, 'train_runtime': '2510', 'train_tokens_per_second': '1977'} +{'loss': '1.145', 'grad_norm': '3.111', 'learning_rate': '3.051e-05', 'epoch': '0.06106', 'num_input_tokens_seen': 4963975, 'train_runtime': '2511', 'train_tokens_per_second': '1977'} +{'loss': '1.213', 'grad_norm': '2.944', 'learning_rate': '3.053e-05', 'epoch': '0.06108', 'num_input_tokens_seen': 4966022, 'train_runtime': '2512', 'train_tokens_per_second': '1977'} +{'loss': '1.136', 'grad_norm': '3.006', 'learning_rate': '3.054e-05', 'epoch': '0.06111', 'num_input_tokens_seen': 4968069, 'train_runtime': '2513', 'train_tokens_per_second': '1977'} +{'loss': '0.468', 'grad_norm': '1.764', 'learning_rate': '3.055e-05', 'epoch': '0.06113', 'num_input_tokens_seen': 4970116, 'train_runtime': '2514', 'train_tokens_per_second': '1977'} +{'loss': '1.149', 'grad_norm': '2.238', 'learning_rate': '3.056e-05', 'epoch': '0.06116', 'num_input_tokens_seen': 4972163, 'train_runtime': '2515', 'train_tokens_per_second': '1977'} +{'loss': '1.5', 'grad_norm': '2.795', 'learning_rate': '3.058e-05', 'epoch': '0.06118', 'num_input_tokens_seen': 4974210, 'train_runtime': '2516', 'train_tokens_per_second': '1977'} +{'loss': '1.101', 'grad_norm': '2.409', 'learning_rate': '3.059e-05', 'epoch': '0.06121', 'num_input_tokens_seen': 4976257, 'train_runtime': '2517', 'train_tokens_per_second': '1977'} +{'loss': '0.7775', 'grad_norm': '2.035', 'learning_rate': '3.06e-05', 'epoch': '0.06123', 'num_input_tokens_seen': 4978304, 'train_runtime': '2518', 'train_tokens_per_second': '1977'} +{'loss': '0.43', 'grad_norm': '1.667', 'learning_rate': '3.061e-05', 'epoch': '0.06126', 'num_input_tokens_seen': 4980351, 'train_runtime': '2519', 'train_tokens_per_second': '1977'} +{'loss': '0.5945', 'grad_norm': '2.629', 'learning_rate': '3.063e-05', 'epoch': '0.06128', 'num_input_tokens_seen': 4982398, 'train_runtime': '2520', 'train_tokens_per_second': '1977'} +{'loss': '0.5922', 'grad_norm': '1.957', 'learning_rate': '3.064e-05', 'epoch': '0.06131', 'num_input_tokens_seen': 4984445, 'train_runtime': '2521', 'train_tokens_per_second': '1977'} +{'loss': '1.833', 'grad_norm': '4.138', 'learning_rate': '3.065e-05', 'epoch': '0.06133', 'num_input_tokens_seen': 4986492, 'train_runtime': '2522', 'train_tokens_per_second': '1977'} +{'loss': '0.7152', 'grad_norm': '2.184', 'learning_rate': '3.066e-05', 'epoch': '0.06136', 'num_input_tokens_seen': 4988539, 'train_runtime': '2523', 'train_tokens_per_second': '1977'} +{'loss': '1.476', 'grad_norm': '2.961', 'learning_rate': '3.068e-05', 'epoch': '0.06138', 'num_input_tokens_seen': 4990586, 'train_runtime': '2524', 'train_tokens_per_second': '1977'} +{'loss': '0.9663', 'grad_norm': '2.646', 'learning_rate': '3.069e-05', 'epoch': '0.06141', 'num_input_tokens_seen': 4992633, 'train_runtime': '2525', 'train_tokens_per_second': '1977'} +{'loss': '2.727', 'grad_norm': '4.113', 'learning_rate': '3.07e-05', 'epoch': '0.06143', 'num_input_tokens_seen': 4994680, 'train_runtime': '2526', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '2.574', 'learning_rate': '3.072e-05', 'epoch': '0.06146', 'num_input_tokens_seen': 4996727, 'train_runtime': '2527', 'train_tokens_per_second': '1977'} +{'loss': '0.8031', 'grad_norm': '2.483', 'learning_rate': '3.073e-05', 'epoch': '0.06149', 'num_input_tokens_seen': 4998774, 'train_runtime': '2528', 'train_tokens_per_second': '1977'} +{'loss': '0.7031', 'grad_norm': '1.605', 'learning_rate': '3.074e-05', 'epoch': '0.06151', 'num_input_tokens_seen': 5000821, 'train_runtime': '2529', 'train_tokens_per_second': '1977'} +{'loss': '0.5054', 'grad_norm': '1.547', 'learning_rate': '3.075e-05', 'epoch': '0.06154', 'num_input_tokens_seen': 5002868, 'train_runtime': '2531', 'train_tokens_per_second': '1977'} +{'loss': '0.4365', 'grad_norm': '2.149', 'learning_rate': '3.077e-05', 'epoch': '0.06156', 'num_input_tokens_seen': 5004915, 'train_runtime': '2532', 'train_tokens_per_second': '1977'} +{'loss': '0.5903', 'grad_norm': '2.103', 'learning_rate': '3.078e-05', 'epoch': '0.06159', 'num_input_tokens_seen': 5006962, 'train_runtime': '2533', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '2.746', 'learning_rate': '3.079e-05', 'epoch': '0.06161', 'num_input_tokens_seen': 5009009, 'train_runtime': '2534', 'train_tokens_per_second': '1977'} +{'loss': '0.4099', 'grad_norm': '1.979', 'learning_rate': '3.08e-05', 'epoch': '0.06164', 'num_input_tokens_seen': 5011056, 'train_runtime': '2535', 'train_tokens_per_second': '1977'} +{'loss': '0.4597', 'grad_norm': '1.911', 'learning_rate': '3.082e-05', 'epoch': '0.06166', 'num_input_tokens_seen': 5013103, 'train_runtime': '2536', 'train_tokens_per_second': '1977'} +{'loss': '1.183', 'grad_norm': '2.445', 'learning_rate': '3.083e-05', 'epoch': '0.06169', 'num_input_tokens_seen': 5015150, 'train_runtime': '2537', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '2.454', 'learning_rate': '3.084e-05', 'epoch': '0.06171', 'num_input_tokens_seen': 5017197, 'train_runtime': '2538', 'train_tokens_per_second': '1977'} +{'loss': '0.8241', 'grad_norm': '2.568', 'learning_rate': '3.085e-05', 'epoch': '0.06174', 'num_input_tokens_seen': 5019244, 'train_runtime': '2539', 'train_tokens_per_second': '1977'} +{'loss': '2.66', 'grad_norm': '3.705', 'learning_rate': '3.087e-05', 'epoch': '0.06176', 'num_input_tokens_seen': 5021291, 'train_runtime': '2540', 'train_tokens_per_second': '1977'} +{'loss': '0.5978', 'grad_norm': '2.046', 'learning_rate': '3.088e-05', 'epoch': '0.06179', 'num_input_tokens_seen': 5023338, 'train_runtime': '2541', 'train_tokens_per_second': '1977'} +{'loss': '1.174', 'grad_norm': '2.688', 'learning_rate': '3.089e-05', 'epoch': '0.06181', 'num_input_tokens_seen': 5025385, 'train_runtime': '2542', 'train_tokens_per_second': '1977'} +{'loss': '1.138', 'grad_norm': '3.142', 'learning_rate': '3.09e-05', 'epoch': '0.06184', 'num_input_tokens_seen': 5027432, 'train_runtime': '2543', 'train_tokens_per_second': '1977'} +{'loss': '0.6071', 'grad_norm': '2.141', 'learning_rate': '3.092e-05', 'epoch': '0.06186', 'num_input_tokens_seen': 5029479, 'train_runtime': '2544', 'train_tokens_per_second': '1977'} +{'loss': '0.4764', 'grad_norm': '2.484', 'learning_rate': '3.093e-05', 'epoch': '0.06189', 'num_input_tokens_seen': 5031526, 'train_runtime': '2545', 'train_tokens_per_second': '1977'} +{'loss': '0.4368', 'grad_norm': '1.694', 'learning_rate': '3.094e-05', 'epoch': '0.06191', 'num_input_tokens_seen': 5033573, 'train_runtime': '2546', 'train_tokens_per_second': '1977'} +{'loss': '0.6179', 'grad_norm': '2.014', 'learning_rate': '3.095e-05', 'epoch': '0.06194', 'num_input_tokens_seen': 5035620, 'train_runtime': '2547', 'train_tokens_per_second': '1977'} +{'loss': '0.5484', 'grad_norm': '2.137', 'learning_rate': '3.097e-05', 'epoch': '0.06196', 'num_input_tokens_seen': 5037667, 'train_runtime': '2548', 'train_tokens_per_second': '1977'} +{'loss': '0.8794', 'grad_norm': '2.607', 'learning_rate': '3.098e-05', 'epoch': '0.06199', 'num_input_tokens_seen': 5039714, 'train_runtime': '2549', 'train_tokens_per_second': '1977'} +{'loss': '2.578', 'grad_norm': '3.631', 'learning_rate': '3.099e-05', 'epoch': '0.06201', 'num_input_tokens_seen': 5041761, 'train_runtime': '2550', 'train_tokens_per_second': '1977'} +{'loss': '1.338', 'grad_norm': '3.229', 'learning_rate': '3.1e-05', 'epoch': '0.06204', 'num_input_tokens_seen': 5043808, 'train_runtime': '2551', 'train_tokens_per_second': '1977'} +{'loss': '0.8571', 'grad_norm': '2.421', 'learning_rate': '3.102e-05', 'epoch': '0.06206', 'num_input_tokens_seen': 5045855, 'train_runtime': '2552', 'train_tokens_per_second': '1977'} +{'loss': '1.527', 'grad_norm': '3.73', 'learning_rate': '3.103e-05', 'epoch': '0.06209', 'num_input_tokens_seen': 5047902, 'train_runtime': '2553', 'train_tokens_per_second': '1977'} +{'loss': '1.162', 'grad_norm': '3.581', 'learning_rate': '3.104e-05', 'epoch': '0.06211', 'num_input_tokens_seen': 5049949, 'train_runtime': '2554', 'train_tokens_per_second': '1977'} +{'loss': '0.6827', 'grad_norm': '2.474', 'learning_rate': '3.105e-05', 'epoch': '0.06214', 'num_input_tokens_seen': 5051996, 'train_runtime': '2555', 'train_tokens_per_second': '1977'} +{'loss': '1.058', 'grad_norm': '2.637', 'learning_rate': '3.107e-05', 'epoch': '0.06216', 'num_input_tokens_seen': 5054043, 'train_runtime': '2556', 'train_tokens_per_second': '1977'} +{'loss': '1.243', 'grad_norm': '2.781', 'learning_rate': '3.108e-05', 'epoch': '0.06219', 'num_input_tokens_seen': 5056090, 'train_runtime': '2557', 'train_tokens_per_second': '1977'} +{'loss': '0.7688', 'grad_norm': '2.424', 'learning_rate': '3.109e-05', 'epoch': '0.06222', 'num_input_tokens_seen': 5058137, 'train_runtime': '2558', 'train_tokens_per_second': '1977'} +{'loss': '1.24', 'grad_norm': '3.117', 'learning_rate': '3.111e-05', 'epoch': '0.06224', 'num_input_tokens_seen': 5060184, 'train_runtime': '2559', 'train_tokens_per_second': '1977'} +{'loss': '2.857', 'grad_norm': '3.246', 'learning_rate': '3.112e-05', 'epoch': '0.06227', 'num_input_tokens_seen': 5062231, 'train_runtime': '2560', 'train_tokens_per_second': '1977'} +{'loss': '1.962', 'grad_norm': '3.042', 'learning_rate': '3.113e-05', 'epoch': '0.06229', 'num_input_tokens_seen': 5064278, 'train_runtime': '2562', 'train_tokens_per_second': '1977'} +{'loss': '1.265', 'grad_norm': '2.494', 'learning_rate': '3.114e-05', 'epoch': '0.06232', 'num_input_tokens_seen': 5066325, 'train_runtime': '2563', 'train_tokens_per_second': '1977'} +{'loss': '0.689', 'grad_norm': '2.23', 'learning_rate': '3.116e-05', 'epoch': '0.06234', 'num_input_tokens_seen': 5068372, 'train_runtime': '2564', 'train_tokens_per_second': '1977'} +{'loss': '1.824', 'grad_norm': '3.601', 'learning_rate': '3.117e-05', 'epoch': '0.06237', 'num_input_tokens_seen': 5070419, 'train_runtime': '2565', 'train_tokens_per_second': '1977'} +{'loss': '0.8833', 'grad_norm': '3.068', 'learning_rate': '3.118e-05', 'epoch': '0.06239', 'num_input_tokens_seen': 5072466, 'train_runtime': '2566', 'train_tokens_per_second': '1977'} +{'loss': '2.469', 'grad_norm': '3.305', 'learning_rate': '3.119e-05', 'epoch': '0.06242', 'num_input_tokens_seen': 5074513, 'train_runtime': '2567', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '2.7', 'learning_rate': '3.121e-05', 'epoch': '0.06244', 'num_input_tokens_seen': 5076560, 'train_runtime': '2568', 'train_tokens_per_second': '1977'} +{'loss': '0.8171', 'grad_norm': '2.302', 'learning_rate': '3.122e-05', 'epoch': '0.06247', 'num_input_tokens_seen': 5078607, 'train_runtime': '2569', 'train_tokens_per_second': '1977'} +{'loss': '0.7255', 'grad_norm': '2.573', 'learning_rate': '3.123e-05', 'epoch': '0.06249', 'num_input_tokens_seen': 5080654, 'train_runtime': '2570', 'train_tokens_per_second': '1977'} +{'loss': '1.752', 'grad_norm': '3.004', 'learning_rate': '3.124e-05', 'epoch': '0.06252', 'num_input_tokens_seen': 5082701, 'train_runtime': '2571', 'train_tokens_per_second': '1977'} +{'loss': '0.4266', 'grad_norm': '1.874', 'learning_rate': '3.126e-05', 'epoch': '0.06254', 'num_input_tokens_seen': 5084748, 'train_runtime': '2572', 'train_tokens_per_second': '1977'} +{'loss': '0.8813', 'grad_norm': '3.412', 'learning_rate': '3.127e-05', 'epoch': '0.06257', 'num_input_tokens_seen': 5086795, 'train_runtime': '2573', 'train_tokens_per_second': '1977'} +{'loss': '1.45', 'grad_norm': '3.072', 'learning_rate': '3.128e-05', 'epoch': '0.06259', 'num_input_tokens_seen': 5088842, 'train_runtime': '2574', 'train_tokens_per_second': '1977'} +{'loss': '1.429', 'grad_norm': '3.097', 'learning_rate': '3.129e-05', 'epoch': '0.06262', 'num_input_tokens_seen': 5090889, 'train_runtime': '2575', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '2.419', 'learning_rate': '3.131e-05', 'epoch': '0.06264', 'num_input_tokens_seen': 5092936, 'train_runtime': '2576', 'train_tokens_per_second': '1977'} +{'loss': '0.4863', 'grad_norm': '1.588', 'learning_rate': '3.132e-05', 'epoch': '0.06267', 'num_input_tokens_seen': 5094983, 'train_runtime': '2577', 'train_tokens_per_second': '1977'} +{'loss': '0.392', 'grad_norm': '1.876', 'learning_rate': '3.133e-05', 'epoch': '0.06269', 'num_input_tokens_seen': 5097030, 'train_runtime': '2578', 'train_tokens_per_second': '1977'} +{'loss': '0.7114', 'grad_norm': '2.415', 'learning_rate': '3.134e-05', 'epoch': '0.06272', 'num_input_tokens_seen': 5099077, 'train_runtime': '2579', 'train_tokens_per_second': '1977'} +{'loss': '0.3865', 'grad_norm': '1.731', 'learning_rate': '3.136e-05', 'epoch': '0.06274', 'num_input_tokens_seen': 5101124, 'train_runtime': '2580', 'train_tokens_per_second': '1977'} +{'loss': '0.4963', 'grad_norm': '1.869', 'learning_rate': '3.137e-05', 'epoch': '0.06277', 'num_input_tokens_seen': 5103171, 'train_runtime': '2581', 'train_tokens_per_second': '1977'} +{'loss': '2.176', 'grad_norm': '3.804', 'learning_rate': '3.138e-05', 'epoch': '0.06279', 'num_input_tokens_seen': 5105218, 'train_runtime': '2582', 'train_tokens_per_second': '1977'} +{'loss': '0.3642', 'grad_norm': '1.547', 'learning_rate': '3.139e-05', 'epoch': '0.06282', 'num_input_tokens_seen': 5107265, 'train_runtime': '2583', 'train_tokens_per_second': '1977'} +{'loss': '1.533', 'grad_norm': '3.711', 'learning_rate': '3.141e-05', 'epoch': '0.06284', 'num_input_tokens_seen': 5109312, 'train_runtime': '2584', 'train_tokens_per_second': '1977'} +{'loss': '1.043', 'grad_norm': '2.484', 'learning_rate': '3.142e-05', 'epoch': '0.06287', 'num_input_tokens_seen': 5111359, 'train_runtime': '2585', 'train_tokens_per_second': '1977'} +{'loss': '2.046', 'grad_norm': '3.942', 'learning_rate': '3.143e-05', 'epoch': '0.06289', 'num_input_tokens_seen': 5113406, 'train_runtime': '2586', 'train_tokens_per_second': '1977'} +{'loss': '2.23', 'grad_norm': '3.128', 'learning_rate': '3.145e-05', 'epoch': '0.06292', 'num_input_tokens_seen': 5115453, 'train_runtime': '2587', 'train_tokens_per_second': '1977'} +{'loss': '0.5361', 'grad_norm': '1.985', 'learning_rate': '3.146e-05', 'epoch': '0.06295', 'num_input_tokens_seen': 5117500, 'train_runtime': '2588', 'train_tokens_per_second': '1977'} +{'loss': '0.5896', 'grad_norm': '1.928', 'learning_rate': '3.147e-05', 'epoch': '0.06297', 'num_input_tokens_seen': 5119547, 'train_runtime': '2589', 'train_tokens_per_second': '1977'} +{'loss': '2.557', 'grad_norm': '3.165', 'learning_rate': '3.148e-05', 'epoch': '0.063', 'num_input_tokens_seen': 5121594, 'train_runtime': '2590', 'train_tokens_per_second': '1977'} +{'loss': '1.861', 'grad_norm': '3.854', 'learning_rate': '3.15e-05', 'epoch': '0.06302', 'num_input_tokens_seen': 5123641, 'train_runtime': '2592', 'train_tokens_per_second': '1977'} +{'loss': '0.3459', 'grad_norm': '1.541', 'learning_rate': '3.151e-05', 'epoch': '0.06305', 'num_input_tokens_seen': 5125688, 'train_runtime': '2593', 'train_tokens_per_second': '1977'} +{'loss': '0.3472', 'grad_norm': '1.614', 'learning_rate': '3.152e-05', 'epoch': '0.06307', 'num_input_tokens_seen': 5127735, 'train_runtime': '2594', 'train_tokens_per_second': '1977'} +{'loss': '0.9137', 'grad_norm': '2.186', 'learning_rate': '3.153e-05', 'epoch': '0.0631', 'num_input_tokens_seen': 5129782, 'train_runtime': '2595', 'train_tokens_per_second': '1977'} +{'loss': '0.3937', 'grad_norm': '2.132', 'learning_rate': '3.155e-05', 'epoch': '0.06312', 'num_input_tokens_seen': 5131829, 'train_runtime': '2596', 'train_tokens_per_second': '1977'} +{'loss': '0.4644', 'grad_norm': '2.284', 'learning_rate': '3.156e-05', 'epoch': '0.06315', 'num_input_tokens_seen': 5133876, 'train_runtime': '2597', 'train_tokens_per_second': '1977'} +{'loss': '1.402', 'grad_norm': '3.178', 'learning_rate': '3.157e-05', 'epoch': '0.06317', 'num_input_tokens_seen': 5135923, 'train_runtime': '2598', 'train_tokens_per_second': '1977'} +{'loss': '2.394', 'grad_norm': '3.433', 'learning_rate': '3.158e-05', 'epoch': '0.0632', 'num_input_tokens_seen': 5137970, 'train_runtime': '2599', 'train_tokens_per_second': '1977'} +{'loss': '0.8516', 'grad_norm': '1.6', 'learning_rate': '3.16e-05', 'epoch': '0.06322', 'num_input_tokens_seen': 5140017, 'train_runtime': '2600', 'train_tokens_per_second': '1977'} +{'loss': '0.7693', 'grad_norm': '2.093', 'learning_rate': '3.161e-05', 'epoch': '0.06325', 'num_input_tokens_seen': 5142064, 'train_runtime': '2601', 'train_tokens_per_second': '1977'} +{'loss': '1.517', 'grad_norm': '3.192', 'learning_rate': '3.162e-05', 'epoch': '0.06327', 'num_input_tokens_seen': 5144111, 'train_runtime': '2602', 'train_tokens_per_second': '1977'} +{'loss': '0.6359', 'grad_norm': '2.065', 'learning_rate': '3.163e-05', 'epoch': '0.0633', 'num_input_tokens_seen': 5146158, 'train_runtime': '2603', 'train_tokens_per_second': '1977'} +{'loss': '1.145', 'grad_norm': '2.462', 'learning_rate': '3.165e-05', 'epoch': '0.06332', 'num_input_tokens_seen': 5148205, 'train_runtime': '2604', 'train_tokens_per_second': '1977'} +{'loss': '0.8697', 'grad_norm': '2.371', 'learning_rate': '3.166e-05', 'epoch': '0.06335', 'num_input_tokens_seen': 5150252, 'train_runtime': '2605', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.892', 'learning_rate': '3.167e-05', 'epoch': '0.06337', 'num_input_tokens_seen': 5152299, 'train_runtime': '2606', 'train_tokens_per_second': '1977'} +{'loss': '0.4558', 'grad_norm': '1.609', 'learning_rate': '3.168e-05', 'epoch': '0.0634', 'num_input_tokens_seen': 5154346, 'train_runtime': '2607', 'train_tokens_per_second': '1977'} +{'loss': '1.675', 'grad_norm': '3.135', 'learning_rate': '3.17e-05', 'epoch': '0.06342', 'num_input_tokens_seen': 5156393, 'train_runtime': '2608', 'train_tokens_per_second': '1977'} +{'loss': '1.674', 'grad_norm': '3.534', 'learning_rate': '3.171e-05', 'epoch': '0.06345', 'num_input_tokens_seen': 5158440, 'train_runtime': '2609', 'train_tokens_per_second': '1977'} +{'loss': '0.7309', 'grad_norm': '2.459', 'learning_rate': '3.172e-05', 'epoch': '0.06347', 'num_input_tokens_seen': 5160487, 'train_runtime': '2610', 'train_tokens_per_second': '1977'} +{'loss': '0.8621', 'grad_norm': '2.251', 'learning_rate': '3.173e-05', 'epoch': '0.0635', 'num_input_tokens_seen': 5162534, 'train_runtime': '2611', 'train_tokens_per_second': '1977'} +{'loss': '0.7221', 'grad_norm': '2.807', 'learning_rate': '3.175e-05', 'epoch': '0.06352', 'num_input_tokens_seen': 5164581, 'train_runtime': '2612', 'train_tokens_per_second': '1977'} +{'loss': '2.121', 'grad_norm': '3.553', 'learning_rate': '3.176e-05', 'epoch': '0.06355', 'num_input_tokens_seen': 5166628, 'train_runtime': '2613', 'train_tokens_per_second': '1977'} +{'loss': '1.518', 'grad_norm': '3.01', 'learning_rate': '3.177e-05', 'epoch': '0.06357', 'num_input_tokens_seen': 5168675, 'train_runtime': '2614', 'train_tokens_per_second': '1977'} +{'loss': '0.4352', 'grad_norm': '1.675', 'learning_rate': '3.178e-05', 'epoch': '0.0636', 'num_input_tokens_seen': 5170722, 'train_runtime': '2615', 'train_tokens_per_second': '1977'} +{'loss': '1.374', 'grad_norm': '2.182', 'learning_rate': '3.18e-05', 'epoch': '0.06363', 'num_input_tokens_seen': 5172769, 'train_runtime': '2616', 'train_tokens_per_second': '1977'} +{'loss': '1.208', 'grad_norm': '3.107', 'learning_rate': '3.181e-05', 'epoch': '0.06365', 'num_input_tokens_seen': 5174816, 'train_runtime': '2617', 'train_tokens_per_second': '1977'} +{'loss': '0.4314', 'grad_norm': '1.461', 'learning_rate': '3.182e-05', 'epoch': '0.06368', 'num_input_tokens_seen': 5176863, 'train_runtime': '2618', 'train_tokens_per_second': '1977'} +{'loss': '0.4586', 'grad_norm': '2.087', 'learning_rate': '3.184e-05', 'epoch': '0.0637', 'num_input_tokens_seen': 5178910, 'train_runtime': '2619', 'train_tokens_per_second': '1977'} +{'loss': '1.46', 'grad_norm': '3.502', 'learning_rate': '3.185e-05', 'epoch': '0.06373', 'num_input_tokens_seen': 5180957, 'train_runtime': '2620', 'train_tokens_per_second': '1977'} +{'loss': '1.561', 'grad_norm': '3.201', 'learning_rate': '3.186e-05', 'epoch': '0.06375', 'num_input_tokens_seen': 5183004, 'train_runtime': '2622', 'train_tokens_per_second': '1977'} +{'loss': '0.8204', 'grad_norm': '2.405', 'learning_rate': '3.187e-05', 'epoch': '0.06378', 'num_input_tokens_seen': 5185051, 'train_runtime': '2623', 'train_tokens_per_second': '1977'} +{'loss': '1.966', 'grad_norm': '4.61', 'learning_rate': '3.189e-05', 'epoch': '0.0638', 'num_input_tokens_seen': 5187098, 'train_runtime': '2624', 'train_tokens_per_second': '1977'} +{'loss': '0.7431', 'grad_norm': '1.832', 'learning_rate': '3.19e-05', 'epoch': '0.06383', 'num_input_tokens_seen': 5189145, 'train_runtime': '2625', 'train_tokens_per_second': '1977'} +{'loss': '2.519', 'grad_norm': '3.135', 'learning_rate': '3.191e-05', 'epoch': '0.06385', 'num_input_tokens_seen': 5191192, 'train_runtime': '2626', 'train_tokens_per_second': '1977'} +{'loss': '0.7962', 'grad_norm': '1.838', 'learning_rate': '3.192e-05', 'epoch': '0.06388', 'num_input_tokens_seen': 5193239, 'train_runtime': '2627', 'train_tokens_per_second': '1977'} +{'loss': '1.916', 'grad_norm': '3.055', 'learning_rate': '3.194e-05', 'epoch': '0.0639', 'num_input_tokens_seen': 5195286, 'train_runtime': '2628', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '2.895', 'learning_rate': '3.195e-05', 'epoch': '0.06393', 'num_input_tokens_seen': 5197333, 'train_runtime': '2629', 'train_tokens_per_second': '1977'} +{'loss': '0.5483', 'grad_norm': '2.098', 'learning_rate': '3.196e-05', 'epoch': '0.06395', 'num_input_tokens_seen': 5199380, 'train_runtime': '2630', 'train_tokens_per_second': '1977'} +{'loss': '0.6981', 'grad_norm': '1.814', 'learning_rate': '3.197e-05', 'epoch': '0.06398', 'num_input_tokens_seen': 5201427, 'train_runtime': '2631', 'train_tokens_per_second': '1977'} +{'loss': '0.989', 'grad_norm': '2.582', 'learning_rate': '3.199e-05', 'epoch': '0.064', 'num_input_tokens_seen': 5203474, 'train_runtime': '2632', 'train_tokens_per_second': '1977'} +{'loss': '1.086', 'grad_norm': '2.344', 'learning_rate': '3.2e-05', 'epoch': '0.06403', 'num_input_tokens_seen': 5205521, 'train_runtime': '2633', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '2.656', 'learning_rate': '3.201e-05', 'epoch': '0.06405', 'num_input_tokens_seen': 5207568, 'train_runtime': '2634', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.91', 'learning_rate': '3.202e-05', 'epoch': '0.06408', 'num_input_tokens_seen': 5209615, 'train_runtime': '2635', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '2.979', 'learning_rate': '3.204e-05', 'epoch': '0.0641', 'num_input_tokens_seen': 5211662, 'train_runtime': '2636', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '2.725', 'learning_rate': '3.205e-05', 'epoch': '0.06413', 'num_input_tokens_seen': 5213709, 'train_runtime': '2637', 'train_tokens_per_second': '1977'} +{'loss': '0.482', 'grad_norm': '1.758', 'learning_rate': '3.206e-05', 'epoch': '0.06415', 'num_input_tokens_seen': 5215756, 'train_runtime': '2638', 'train_tokens_per_second': '1977'} +{'loss': '2.254', 'grad_norm': '3.451', 'learning_rate': '3.207e-05', 'epoch': '0.06418', 'num_input_tokens_seen': 5217803, 'train_runtime': '2639', 'train_tokens_per_second': '1977'} +{'loss': '0.5057', 'grad_norm': '2.133', 'learning_rate': '3.209e-05', 'epoch': '0.0642', 'num_input_tokens_seen': 5219850, 'train_runtime': '2640', 'train_tokens_per_second': '1977'} +{'loss': '2.023', 'grad_norm': '2.746', 'learning_rate': '3.21e-05', 'epoch': '0.06423', 'num_input_tokens_seen': 5221897, 'train_runtime': '2641', 'train_tokens_per_second': '1977'} +{'loss': '0.7283', 'grad_norm': '2.308', 'learning_rate': '3.211e-05', 'epoch': '0.06425', 'num_input_tokens_seen': 5223944, 'train_runtime': '2642', 'train_tokens_per_second': '1977'} +{'loss': '0.4611', 'grad_norm': '1.672', 'learning_rate': '3.212e-05', 'epoch': '0.06428', 'num_input_tokens_seen': 5225991, 'train_runtime': '2643', 'train_tokens_per_second': '1977'} +{'loss': '0.8542', 'grad_norm': '1.832', 'learning_rate': '3.214e-05', 'epoch': '0.0643', 'num_input_tokens_seen': 5228038, 'train_runtime': '2644', 'train_tokens_per_second': '1977'} +{'loss': '0.8176', 'grad_norm': '2.688', 'learning_rate': '3.215e-05', 'epoch': '0.06433', 'num_input_tokens_seen': 5230085, 'train_runtime': '2645', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '4.251', 'learning_rate': '3.216e-05', 'epoch': '0.06436', 'num_input_tokens_seen': 5232132, 'train_runtime': '2646', 'train_tokens_per_second': '1977'} +{'loss': '0.9629', 'grad_norm': '3.058', 'learning_rate': '3.218e-05', 'epoch': '0.06438', 'num_input_tokens_seen': 5234179, 'train_runtime': '2647', 'train_tokens_per_second': '1977'} +{'loss': '0.3961', 'grad_norm': '1.895', 'learning_rate': '3.219e-05', 'epoch': '0.06441', 'num_input_tokens_seen': 5236226, 'train_runtime': '2648', 'train_tokens_per_second': '1977'} +{'loss': '1.436', 'grad_norm': '3.223', 'learning_rate': '3.22e-05', 'epoch': '0.06443', 'num_input_tokens_seen': 5238273, 'train_runtime': '2649', 'train_tokens_per_second': '1977'} +{'loss': '1.143', 'grad_norm': '3.237', 'learning_rate': '3.221e-05', 'epoch': '0.06446', 'num_input_tokens_seen': 5240320, 'train_runtime': '2650', 'train_tokens_per_second': '1977'} +{'loss': '1.003', 'grad_norm': '2.483', 'learning_rate': '3.223e-05', 'epoch': '0.06448', 'num_input_tokens_seen': 5242367, 'train_runtime': '2652', 'train_tokens_per_second': '1977'} +{'loss': '1.283', 'grad_norm': '3.161', 'learning_rate': '3.224e-05', 'epoch': '0.06451', 'num_input_tokens_seen': 5244414, 'train_runtime': '2653', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '3.068', 'learning_rate': '3.225e-05', 'epoch': '0.06453', 'num_input_tokens_seen': 5246461, 'train_runtime': '2654', 'train_tokens_per_second': '1977'} +{'loss': '1.552', 'grad_norm': '3.159', 'learning_rate': '3.226e-05', 'epoch': '0.06456', 'num_input_tokens_seen': 5248508, 'train_runtime': '2655', 'train_tokens_per_second': '1977'} +{'loss': '0.452', 'grad_norm': '1.909', 'learning_rate': '3.228e-05', 'epoch': '0.06458', 'num_input_tokens_seen': 5250555, 'train_runtime': '2656', 'train_tokens_per_second': '1977'} +{'loss': '0.7464', 'grad_norm': '2.444', 'learning_rate': '3.229e-05', 'epoch': '0.06461', 'num_input_tokens_seen': 5252602, 'train_runtime': '2657', 'train_tokens_per_second': '1977'} +{'loss': '1.534', 'grad_norm': '3.838', 'learning_rate': '3.23e-05', 'epoch': '0.06463', 'num_input_tokens_seen': 5254649, 'train_runtime': '2658', 'train_tokens_per_second': '1977'} +{'loss': '1.488', 'grad_norm': '2.975', 'learning_rate': '3.231e-05', 'epoch': '0.06466', 'num_input_tokens_seen': 5256696, 'train_runtime': '2659', 'train_tokens_per_second': '1977'} +{'loss': '0.4611', 'grad_norm': '1.367', 'learning_rate': '3.233e-05', 'epoch': '0.06468', 'num_input_tokens_seen': 5258743, 'train_runtime': '2660', 'train_tokens_per_second': '1977'} +{'loss': '0.517', 'grad_norm': '1.752', 'learning_rate': '3.234e-05', 'epoch': '0.06471', 'num_input_tokens_seen': 5260790, 'train_runtime': '2661', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '2.76', 'learning_rate': '3.235e-05', 'epoch': '0.06473', 'num_input_tokens_seen': 5262837, 'train_runtime': '2662', 'train_tokens_per_second': '1977'} +{'loss': '0.8672', 'grad_norm': '2.287', 'learning_rate': '3.236e-05', 'epoch': '0.06476', 'num_input_tokens_seen': 5264884, 'train_runtime': '2663', 'train_tokens_per_second': '1977'} +{'loss': '1.521', 'grad_norm': '2.61', 'learning_rate': '3.238e-05', 'epoch': '0.06478', 'num_input_tokens_seen': 5266931, 'train_runtime': '2664', 'train_tokens_per_second': '1977'} +{'loss': '0.6406', 'grad_norm': '1.837', 'learning_rate': '3.239e-05', 'epoch': '0.06481', 'num_input_tokens_seen': 5268978, 'train_runtime': '2665', 'train_tokens_per_second': '1977'} +{'loss': '0.9707', 'grad_norm': '2.545', 'learning_rate': '3.24e-05', 'epoch': '0.06483', 'num_input_tokens_seen': 5271025, 'train_runtime': '2666', 'train_tokens_per_second': '1977'} +{'loss': '0.4604', 'grad_norm': '1.795', 'learning_rate': '3.241e-05', 'epoch': '0.06486', 'num_input_tokens_seen': 5273072, 'train_runtime': '2667', 'train_tokens_per_second': '1977'} +{'loss': '0.5316', 'grad_norm': '2.043', 'learning_rate': '3.243e-05', 'epoch': '0.06488', 'num_input_tokens_seen': 5275119, 'train_runtime': '2668', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '2.593', 'learning_rate': '3.244e-05', 'epoch': '0.06491', 'num_input_tokens_seen': 5277166, 'train_runtime': '2669', 'train_tokens_per_second': '1977'} +{'loss': '1.08', 'grad_norm': '3.058', 'learning_rate': '3.245e-05', 'epoch': '0.06493', 'num_input_tokens_seen': 5279213, 'train_runtime': '2670', 'train_tokens_per_second': '1977'} +{'loss': '2.119', 'grad_norm': '3.655', 'learning_rate': '3.246e-05', 'epoch': '0.06496', 'num_input_tokens_seen': 5281260, 'train_runtime': '2671', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '2.392', 'learning_rate': '3.248e-05', 'epoch': '0.06498', 'num_input_tokens_seen': 5283307, 'train_runtime': '2672', 'train_tokens_per_second': '1977'} +{'loss': '0.3108', 'grad_norm': '1.838', 'learning_rate': '3.249e-05', 'epoch': '0.06501', 'num_input_tokens_seen': 5285354, 'train_runtime': '2673', 'train_tokens_per_second': '1977'} +{'loss': '1.176', 'grad_norm': '2.33', 'learning_rate': '3.25e-05', 'epoch': '0.06504', 'num_input_tokens_seen': 5287401, 'train_runtime': '2674', 'train_tokens_per_second': '1977'} +{'loss': '0.4954', 'grad_norm': '2.048', 'learning_rate': '3.252e-05', 'epoch': '0.06506', 'num_input_tokens_seen': 5289448, 'train_runtime': '2675', 'train_tokens_per_second': '1977'} +{'loss': '1.572', 'grad_norm': '3.881', 'learning_rate': '3.253e-05', 'epoch': '0.06509', 'num_input_tokens_seen': 5291495, 'train_runtime': '2676', 'train_tokens_per_second': '1977'} +{'loss': '0.4178', 'grad_norm': '1.739', 'learning_rate': '3.254e-05', 'epoch': '0.06511', 'num_input_tokens_seen': 5293542, 'train_runtime': '2677', 'train_tokens_per_second': '1977'} +{'loss': '0.8932', 'grad_norm': '2.501', 'learning_rate': '3.255e-05', 'epoch': '0.06514', 'num_input_tokens_seen': 5295589, 'train_runtime': '2678', 'train_tokens_per_second': '1977'} +{'loss': '0.4593', 'grad_norm': '2.565', 'learning_rate': '3.257e-05', 'epoch': '0.06516', 'num_input_tokens_seen': 5297636, 'train_runtime': '2679', 'train_tokens_per_second': '1977'} +{'loss': '0.3137', 'grad_norm': '1.774', 'learning_rate': '3.258e-05', 'epoch': '0.06519', 'num_input_tokens_seen': 5299683, 'train_runtime': '2680', 'train_tokens_per_second': '1977'} +{'loss': '0.752', 'grad_norm': '2.283', 'learning_rate': '3.259e-05', 'epoch': '0.06521', 'num_input_tokens_seen': 5301730, 'train_runtime': '2682', 'train_tokens_per_second': '1977'} +{'loss': '0.6355', 'grad_norm': '1.926', 'learning_rate': '3.26e-05', 'epoch': '0.06524', 'num_input_tokens_seen': 5303777, 'train_runtime': '2683', 'train_tokens_per_second': '1977'} +{'loss': '0.9103', 'grad_norm': '2.185', 'learning_rate': '3.262e-05', 'epoch': '0.06526', 'num_input_tokens_seen': 5305824, 'train_runtime': '2684', 'train_tokens_per_second': '1977'} +{'loss': '0.8831', 'grad_norm': '2.224', 'learning_rate': '3.263e-05', 'epoch': '0.06529', 'num_input_tokens_seen': 5307871, 'train_runtime': '2685', 'train_tokens_per_second': '1977'} +{'loss': '0.7796', 'grad_norm': '2.201', 'learning_rate': '3.264e-05', 'epoch': '0.06531', 'num_input_tokens_seen': 5309918, 'train_runtime': '2686', 'train_tokens_per_second': '1977'} +{'loss': '1.282', 'grad_norm': '3.234', 'learning_rate': '3.265e-05', 'epoch': '0.06534', 'num_input_tokens_seen': 5311965, 'train_runtime': '2687', 'train_tokens_per_second': '1977'} +{'loss': '0.5017', 'grad_norm': '2.221', 'learning_rate': '3.267e-05', 'epoch': '0.06536', 'num_input_tokens_seen': 5314012, 'train_runtime': '2688', 'train_tokens_per_second': '1977'} +{'loss': '0.9714', 'grad_norm': '2.268', 'learning_rate': '3.268e-05', 'epoch': '0.06539', 'num_input_tokens_seen': 5316059, 'train_runtime': '2689', 'train_tokens_per_second': '1977'} +{'loss': '0.6672', 'grad_norm': '2.296', 'learning_rate': '3.269e-05', 'epoch': '0.06541', 'num_input_tokens_seen': 5318106, 'train_runtime': '2690', 'train_tokens_per_second': '1977'} +{'loss': '2.253', 'grad_norm': '3.268', 'learning_rate': '3.27e-05', 'epoch': '0.06544', 'num_input_tokens_seen': 5320153, 'train_runtime': '2691', 'train_tokens_per_second': '1977'} +{'loss': '0.5024', 'grad_norm': '1.807', 'learning_rate': '3.272e-05', 'epoch': '0.06546', 'num_input_tokens_seen': 5322200, 'train_runtime': '2692', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.872', 'learning_rate': '3.273e-05', 'epoch': '0.06549', 'num_input_tokens_seen': 5324247, 'train_runtime': '2693', 'train_tokens_per_second': '1977'} +{'loss': '0.7948', 'grad_norm': '2.272', 'learning_rate': '3.274e-05', 'epoch': '0.06551', 'num_input_tokens_seen': 5326294, 'train_runtime': '2694', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '2.617', 'learning_rate': '3.275e-05', 'epoch': '0.06554', 'num_input_tokens_seen': 5328341, 'train_runtime': '2695', 'train_tokens_per_second': '1977'} +{'loss': '1.142', 'grad_norm': '2.635', 'learning_rate': '3.277e-05', 'epoch': '0.06556', 'num_input_tokens_seen': 5330388, 'train_runtime': '2696', 'train_tokens_per_second': '1977'} +{'loss': '2.065', 'grad_norm': '3.658', 'learning_rate': '3.278e-05', 'epoch': '0.06559', 'num_input_tokens_seen': 5332435, 'train_runtime': '2697', 'train_tokens_per_second': '1977'} +{'loss': '0.4418', 'grad_norm': '2.007', 'learning_rate': '3.279e-05', 'epoch': '0.06561', 'num_input_tokens_seen': 5334482, 'train_runtime': '2698', 'train_tokens_per_second': '1977'} +{'loss': '0.4449', 'grad_norm': '1.924', 'learning_rate': '3.28e-05', 'epoch': '0.06564', 'num_input_tokens_seen': 5336529, 'train_runtime': '2699', 'train_tokens_per_second': '1977'} +{'loss': '0.892', 'grad_norm': '2.634', 'learning_rate': '3.282e-05', 'epoch': '0.06566', 'num_input_tokens_seen': 5338576, 'train_runtime': '2700', 'train_tokens_per_second': '1977'} +{'loss': '1.403', 'grad_norm': '3.233', 'learning_rate': '3.283e-05', 'epoch': '0.06569', 'num_input_tokens_seen': 5340623, 'train_runtime': '2701', 'train_tokens_per_second': '1977'} +{'loss': '2.688', 'grad_norm': '3.627', 'learning_rate': '3.284e-05', 'epoch': '0.06571', 'num_input_tokens_seen': 5342670, 'train_runtime': '2702', 'train_tokens_per_second': '1977'} +{'loss': '0.8029', 'grad_norm': '2.6', 'learning_rate': '3.285e-05', 'epoch': '0.06574', 'num_input_tokens_seen': 5344717, 'train_runtime': '2703', 'train_tokens_per_second': '1977'} +{'loss': '1.405', 'grad_norm': '2.904', 'learning_rate': '3.287e-05', 'epoch': '0.06577', 'num_input_tokens_seen': 5346764, 'train_runtime': '2704', 'train_tokens_per_second': '1977'} +{'loss': '0.4187', 'grad_norm': '1.711', 'learning_rate': '3.288e-05', 'epoch': '0.06579', 'num_input_tokens_seen': 5348811, 'train_runtime': '2706', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '2.919', 'learning_rate': '3.289e-05', 'epoch': '0.06582', 'num_input_tokens_seen': 5350858, 'train_runtime': '2707', 'train_tokens_per_second': '1977'} +{'loss': '0.9676', 'grad_norm': '2.525', 'learning_rate': '3.291e-05', 'epoch': '0.06584', 'num_input_tokens_seen': 5352905, 'train_runtime': '2708', 'train_tokens_per_second': '1977'} +{'loss': '0.481', 'grad_norm': '2.029', 'learning_rate': '3.292e-05', 'epoch': '0.06587', 'num_input_tokens_seen': 5354952, 'train_runtime': '2709', 'train_tokens_per_second': '1977'} +{'loss': '0.8654', 'grad_norm': '2.183', 'learning_rate': '3.293e-05', 'epoch': '0.06589', 'num_input_tokens_seen': 5356999, 'train_runtime': '2710', 'train_tokens_per_second': '1977'} +{'loss': '1.079', 'grad_norm': '2.583', 'learning_rate': '3.294e-05', 'epoch': '0.06592', 'num_input_tokens_seen': 5359046, 'train_runtime': '2711', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '1.942', 'learning_rate': '3.296e-05', 'epoch': '0.06594', 'num_input_tokens_seen': 5361093, 'train_runtime': '2712', 'train_tokens_per_second': '1977'} +{'loss': '1.487', 'grad_norm': '2.697', 'learning_rate': '3.297e-05', 'epoch': '0.06597', 'num_input_tokens_seen': 5363140, 'train_runtime': '2713', 'train_tokens_per_second': '1977'} +{'loss': '0.6875', 'grad_norm': '2.335', 'learning_rate': '3.298e-05', 'epoch': '0.06599', 'num_input_tokens_seen': 5365187, 'train_runtime': '2714', 'train_tokens_per_second': '1977'} +{'loss': '0.5127', 'grad_norm': '1.871', 'learning_rate': '3.299e-05', 'epoch': '0.06602', 'num_input_tokens_seen': 5367234, 'train_runtime': '2715', 'train_tokens_per_second': '1977'} +{'loss': '1.982', 'grad_norm': '2.785', 'learning_rate': '3.301e-05', 'epoch': '0.06604', 'num_input_tokens_seen': 5369281, 'train_runtime': '2716', 'train_tokens_per_second': '1977'} +{'loss': '0.5526', 'grad_norm': '2.209', 'learning_rate': '3.302e-05', 'epoch': '0.06607', 'num_input_tokens_seen': 5371328, 'train_runtime': '2717', 'train_tokens_per_second': '1977'} +{'loss': '0.9656', 'grad_norm': '2.08', 'learning_rate': '3.303e-05', 'epoch': '0.06609', 'num_input_tokens_seen': 5373375, 'train_runtime': '2718', 'train_tokens_per_second': '1977'} +{'loss': '0.9829', 'grad_norm': '2.21', 'learning_rate': '3.304e-05', 'epoch': '0.06612', 'num_input_tokens_seen': 5375422, 'train_runtime': '2719', 'train_tokens_per_second': '1977'} +{'loss': '0.4925', 'grad_norm': '1.624', 'learning_rate': '3.306e-05', 'epoch': '0.06614', 'num_input_tokens_seen': 5377469, 'train_runtime': '2720', 'train_tokens_per_second': '1977'} +{'loss': '0.9568', 'grad_norm': '2.451', 'learning_rate': '3.307e-05', 'epoch': '0.06617', 'num_input_tokens_seen': 5379516, 'train_runtime': '2721', 'train_tokens_per_second': '1977'} +{'loss': '1.094', 'grad_norm': '2.436', 'learning_rate': '3.308e-05', 'epoch': '0.06619', 'num_input_tokens_seen': 5381563, 'train_runtime': '2722', 'train_tokens_per_second': '1977'} +{'loss': '1.641', 'grad_norm': '3.25', 'learning_rate': '3.309e-05', 'epoch': '0.06622', 'num_input_tokens_seen': 5383610, 'train_runtime': '2723', 'train_tokens_per_second': '1977'} +{'loss': '0.8854', 'grad_norm': '2.414', 'learning_rate': '3.311e-05', 'epoch': '0.06624', 'num_input_tokens_seen': 5385657, 'train_runtime': '2724', 'train_tokens_per_second': '1977'} +{'loss': '2.096', 'grad_norm': '2.904', 'learning_rate': '3.312e-05', 'epoch': '0.06627', 'num_input_tokens_seen': 5387704, 'train_runtime': '2725', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '2.636', 'learning_rate': '3.313e-05', 'epoch': '0.06629', 'num_input_tokens_seen': 5389751, 'train_runtime': '2726', 'train_tokens_per_second': '1977'} +{'loss': '0.7062', 'grad_norm': '2.125', 'learning_rate': '3.314e-05', 'epoch': '0.06632', 'num_input_tokens_seen': 5391798, 'train_runtime': '2727', 'train_tokens_per_second': '1977'} +{'loss': '1.311', 'grad_norm': '2.455', 'learning_rate': '3.316e-05', 'epoch': '0.06634', 'num_input_tokens_seen': 5393845, 'train_runtime': '2728', 'train_tokens_per_second': '1977'} +{'loss': '0.7971', 'grad_norm': '2.062', 'learning_rate': '3.317e-05', 'epoch': '0.06637', 'num_input_tokens_seen': 5395892, 'train_runtime': '2729', 'train_tokens_per_second': '1977'} +{'loss': '0.9827', 'grad_norm': '2.447', 'learning_rate': '3.318e-05', 'epoch': '0.06639', 'num_input_tokens_seen': 5397939, 'train_runtime': '2730', 'train_tokens_per_second': '1977'} +{'loss': '0.9622', 'grad_norm': '2.362', 'learning_rate': '3.319e-05', 'epoch': '0.06642', 'num_input_tokens_seen': 5399986, 'train_runtime': '2731', 'train_tokens_per_second': '1977'} +{'loss': '1.051', 'grad_norm': '2.755', 'learning_rate': '3.321e-05', 'epoch': '0.06645', 'num_input_tokens_seen': 5402033, 'train_runtime': '2732', 'train_tokens_per_second': '1977'} +{'loss': '1.1', 'grad_norm': '2.916', 'learning_rate': '3.322e-05', 'epoch': '0.06647', 'num_input_tokens_seen': 5404080, 'train_runtime': '2733', 'train_tokens_per_second': '1977'} +{'loss': '0.6006', 'grad_norm': '2.551', 'learning_rate': '3.323e-05', 'epoch': '0.0665', 'num_input_tokens_seen': 5406127, 'train_runtime': '2734', 'train_tokens_per_second': '1977'} +{'loss': '0.7742', 'grad_norm': '2.188', 'learning_rate': '3.325e-05', 'epoch': '0.06652', 'num_input_tokens_seen': 5408174, 'train_runtime': '2736', 'train_tokens_per_second': '1977'} +{'loss': '0.7863', 'grad_norm': '2.141', 'learning_rate': '3.326e-05', 'epoch': '0.06655', 'num_input_tokens_seen': 5410221, 'train_runtime': '2737', 'train_tokens_per_second': '1977'} +{'loss': '1.669', 'grad_norm': '3.275', 'learning_rate': '3.327e-05', 'epoch': '0.06657', 'num_input_tokens_seen': 5412268, 'train_runtime': '2738', 'train_tokens_per_second': '1977'} +{'loss': '0.6618', 'grad_norm': '2.812', 'learning_rate': '3.328e-05', 'epoch': '0.0666', 'num_input_tokens_seen': 5414315, 'train_runtime': '2739', 'train_tokens_per_second': '1977'} +{'loss': '0.8005', 'grad_norm': '1.951', 'learning_rate': '3.33e-05', 'epoch': '0.06662', 'num_input_tokens_seen': 5416362, 'train_runtime': '2740', 'train_tokens_per_second': '1977'} +{'loss': '1.655', 'grad_norm': '3.047', 'learning_rate': '3.331e-05', 'epoch': '0.06665', 'num_input_tokens_seen': 5418409, 'train_runtime': '2741', 'train_tokens_per_second': '1977'} +{'loss': '0.5308', 'grad_norm': '1.892', 'learning_rate': '3.332e-05', 'epoch': '0.06667', 'num_input_tokens_seen': 5420456, 'train_runtime': '2742', 'train_tokens_per_second': '1977'} +{'loss': '1.266', 'grad_norm': '2.659', 'learning_rate': '3.333e-05', 'epoch': '0.0667', 'num_input_tokens_seen': 5422503, 'train_runtime': '2743', 'train_tokens_per_second': '1977'} +{'loss': '0.9646', 'grad_norm': '2.864', 'learning_rate': '3.335e-05', 'epoch': '0.06672', 'num_input_tokens_seen': 5424550, 'train_runtime': '2744', 'train_tokens_per_second': '1977'} +{'loss': '1.166', 'grad_norm': '2.301', 'learning_rate': '3.336e-05', 'epoch': '0.06675', 'num_input_tokens_seen': 5426597, 'train_runtime': '2745', 'train_tokens_per_second': '1977'} +{'loss': '0.8399', 'grad_norm': '2.5', 'learning_rate': '3.337e-05', 'epoch': '0.06677', 'num_input_tokens_seen': 5428644, 'train_runtime': '2746', 'train_tokens_per_second': '1977'} +{'loss': '1.733', 'grad_norm': '3.57', 'learning_rate': '3.338e-05', 'epoch': '0.0668', 'num_input_tokens_seen': 5430691, 'train_runtime': '2747', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '2.5', 'learning_rate': '3.34e-05', 'epoch': '0.06682', 'num_input_tokens_seen': 5432738, 'train_runtime': '2748', 'train_tokens_per_second': '1977'} +{'loss': '0.7625', 'grad_norm': '2.152', 'learning_rate': '3.341e-05', 'epoch': '0.06685', 'num_input_tokens_seen': 5434785, 'train_runtime': '2749', 'train_tokens_per_second': '1977'} +{'loss': '1.676', 'grad_norm': '2.883', 'learning_rate': '3.342e-05', 'epoch': '0.06687', 'num_input_tokens_seen': 5436832, 'train_runtime': '2750', 'train_tokens_per_second': '1977'} +{'loss': '0.6406', 'grad_norm': '2.316', 'learning_rate': '3.343e-05', 'epoch': '0.0669', 'num_input_tokens_seen': 5438879, 'train_runtime': '2751', 'train_tokens_per_second': '1977'} +{'loss': '0.8332', 'grad_norm': '2.331', 'learning_rate': '3.345e-05', 'epoch': '0.06692', 'num_input_tokens_seen': 5440926, 'train_runtime': '2752', 'train_tokens_per_second': '1977'} +{'loss': '1.766', 'grad_norm': '3.233', 'learning_rate': '3.346e-05', 'epoch': '0.06695', 'num_input_tokens_seen': 5442973, 'train_runtime': '2753', 'train_tokens_per_second': '1977'} +{'loss': '0.7069', 'grad_norm': '2.183', 'learning_rate': '3.347e-05', 'epoch': '0.06697', 'num_input_tokens_seen': 5445020, 'train_runtime': '2754', 'train_tokens_per_second': '1977'} +{'loss': '0.4254', 'grad_norm': '1.914', 'learning_rate': '3.348e-05', 'epoch': '0.067', 'num_input_tokens_seen': 5447067, 'train_runtime': '2755', 'train_tokens_per_second': '1977'} +{'loss': '0.9152', 'grad_norm': '2.261', 'learning_rate': '3.35e-05', 'epoch': '0.06702', 'num_input_tokens_seen': 5449114, 'train_runtime': '2756', 'train_tokens_per_second': '1977'} +{'loss': '0.782', 'grad_norm': '1.848', 'learning_rate': '3.351e-05', 'epoch': '0.06705', 'num_input_tokens_seen': 5451161, 'train_runtime': '2757', 'train_tokens_per_second': '1977'} +{'loss': '0.699', 'grad_norm': '2.07', 'learning_rate': '3.352e-05', 'epoch': '0.06707', 'num_input_tokens_seen': 5453208, 'train_runtime': '2758', 'train_tokens_per_second': '1977'} +{'loss': '0.6619', 'grad_norm': '2.071', 'learning_rate': '3.353e-05', 'epoch': '0.0671', 'num_input_tokens_seen': 5455255, 'train_runtime': '2759', 'train_tokens_per_second': '1977'} +{'loss': '0.4726', 'grad_norm': '1.753', 'learning_rate': '3.355e-05', 'epoch': '0.06712', 'num_input_tokens_seen': 5457302, 'train_runtime': '2760', 'train_tokens_per_second': '1977'} +{'loss': '0.4498', 'grad_norm': '1.887', 'learning_rate': '3.356e-05', 'epoch': '0.06715', 'num_input_tokens_seen': 5459349, 'train_runtime': '2761', 'train_tokens_per_second': '1977'} +{'loss': '0.8421', 'grad_norm': '2.417', 'learning_rate': '3.357e-05', 'epoch': '0.06718', 'num_input_tokens_seen': 5461396, 'train_runtime': '2762', 'train_tokens_per_second': '1977'} +{'loss': '1.295', 'grad_norm': '2.584', 'learning_rate': '3.359e-05', 'epoch': '0.0672', 'num_input_tokens_seen': 5463443, 'train_runtime': '2763', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '2.519', 'learning_rate': '3.36e-05', 'epoch': '0.06723', 'num_input_tokens_seen': 5465490, 'train_runtime': '2764', 'train_tokens_per_second': '1977'} +{'loss': '2.43', 'grad_norm': '3.677', 'learning_rate': '3.361e-05', 'epoch': '0.06725', 'num_input_tokens_seen': 5467537, 'train_runtime': '2766', 'train_tokens_per_second': '1977'} +{'loss': '1.948', 'grad_norm': '3.08', 'learning_rate': '3.362e-05', 'epoch': '0.06728', 'num_input_tokens_seen': 5469584, 'train_runtime': '2767', 'train_tokens_per_second': '1977'} +{'loss': '2.038', 'grad_norm': '3.953', 'learning_rate': '3.364e-05', 'epoch': '0.0673', 'num_input_tokens_seen': 5471631, 'train_runtime': '2768', 'train_tokens_per_second': '1977'} +{'loss': '0.4375', 'grad_norm': '1.95', 'learning_rate': '3.365e-05', 'epoch': '0.06733', 'num_input_tokens_seen': 5473678, 'train_runtime': '2769', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '2.57', 'learning_rate': '3.366e-05', 'epoch': '0.06735', 'num_input_tokens_seen': 5475725, 'train_runtime': '2770', 'train_tokens_per_second': '1977'} +{'loss': '0.9321', 'grad_norm': '2.49', 'learning_rate': '3.367e-05', 'epoch': '0.06738', 'num_input_tokens_seen': 5477772, 'train_runtime': '2771', 'train_tokens_per_second': '1977'} +{'loss': '0.6864', 'grad_norm': '2.221', 'learning_rate': '3.369e-05', 'epoch': '0.0674', 'num_input_tokens_seen': 5479819, 'train_runtime': '2772', 'train_tokens_per_second': '1977'} +{'loss': '0.3981', 'grad_norm': '1.783', 'learning_rate': '3.37e-05', 'epoch': '0.06743', 'num_input_tokens_seen': 5481866, 'train_runtime': '2773', 'train_tokens_per_second': '1977'} +{'loss': '1.06', 'grad_norm': '2.581', 'learning_rate': '3.371e-05', 'epoch': '0.06745', 'num_input_tokens_seen': 5483913, 'train_runtime': '2774', 'train_tokens_per_second': '1977'} +{'loss': '0.8188', 'grad_norm': '2.178', 'learning_rate': '3.372e-05', 'epoch': '0.06748', 'num_input_tokens_seen': 5485960, 'train_runtime': '2775', 'train_tokens_per_second': '1977'} +{'loss': '0.7178', 'grad_norm': '2.131', 'learning_rate': '3.374e-05', 'epoch': '0.0675', 'num_input_tokens_seen': 5488007, 'train_runtime': '2776', 'train_tokens_per_second': '1977'} +{'loss': '1.535', 'grad_norm': '2.848', 'learning_rate': '3.375e-05', 'epoch': '0.06753', 'num_input_tokens_seen': 5490054, 'train_runtime': '2777', 'train_tokens_per_second': '1977'} +{'loss': '0.6482', 'grad_norm': '2.61', 'learning_rate': '3.376e-05', 'epoch': '0.06755', 'num_input_tokens_seen': 5492101, 'train_runtime': '2778', 'train_tokens_per_second': '1977'} +{'loss': '1.054', 'grad_norm': '2.277', 'learning_rate': '3.377e-05', 'epoch': '0.06758', 'num_input_tokens_seen': 5494148, 'train_runtime': '2779', 'train_tokens_per_second': '1977'} +{'loss': '0.6851', 'grad_norm': '2.066', 'learning_rate': '3.379e-05', 'epoch': '0.0676', 'num_input_tokens_seen': 5496195, 'train_runtime': '2780', 'train_tokens_per_second': '1977'} +{'loss': '2.486', 'grad_norm': '5.515', 'learning_rate': '3.38e-05', 'epoch': '0.06763', 'num_input_tokens_seen': 5498242, 'train_runtime': '2781', 'train_tokens_per_second': '1977'} +{'loss': '1.284', 'grad_norm': '3.082', 'learning_rate': '3.381e-05', 'epoch': '0.06765', 'num_input_tokens_seen': 5500289, 'train_runtime': '2782', 'train_tokens_per_second': '1977'} +{'loss': '1.837', 'grad_norm': '3.268', 'learning_rate': '3.382e-05', 'epoch': '0.06768', 'num_input_tokens_seen': 5502336, 'train_runtime': '2783', 'train_tokens_per_second': '1977'} +{'loss': '0.3365', 'grad_norm': '1.698', 'learning_rate': '3.384e-05', 'epoch': '0.0677', 'num_input_tokens_seen': 5504383, 'train_runtime': '2784', 'train_tokens_per_second': '1977'} +{'loss': '0.7199', 'grad_norm': '2.214', 'learning_rate': '3.385e-05', 'epoch': '0.06773', 'num_input_tokens_seen': 5506430, 'train_runtime': '2785', 'train_tokens_per_second': '1977'} +{'loss': '0.4878', 'grad_norm': '1.976', 'learning_rate': '3.386e-05', 'epoch': '0.06775', 'num_input_tokens_seen': 5508477, 'train_runtime': '2786', 'train_tokens_per_second': '1977'} +{'loss': '1.252', 'grad_norm': '2.753', 'learning_rate': '3.387e-05', 'epoch': '0.06778', 'num_input_tokens_seen': 5510524, 'train_runtime': '2787', 'train_tokens_per_second': '1977'} +{'loss': '0.6362', 'grad_norm': '2.402', 'learning_rate': '3.389e-05', 'epoch': '0.0678', 'num_input_tokens_seen': 5512571, 'train_runtime': '2788', 'train_tokens_per_second': '1977'} +{'loss': '0.8342', 'grad_norm': '2.304', 'learning_rate': '3.39e-05', 'epoch': '0.06783', 'num_input_tokens_seen': 5514618, 'train_runtime': '2789', 'train_tokens_per_second': '1977'} +{'loss': '1.807', 'grad_norm': '3.02', 'learning_rate': '3.391e-05', 'epoch': '0.06786', 'num_input_tokens_seen': 5516665, 'train_runtime': '2790', 'train_tokens_per_second': '1977'} +{'loss': '0.7491', 'grad_norm': '1.886', 'learning_rate': '3.392e-05', 'epoch': '0.06788', 'num_input_tokens_seen': 5518712, 'train_runtime': '2791', 'train_tokens_per_second': '1977'} +{'loss': '0.694', 'grad_norm': '2.44', 'learning_rate': '3.394e-05', 'epoch': '0.06791', 'num_input_tokens_seen': 5520759, 'train_runtime': '2792', 'train_tokens_per_second': '1977'} +{'loss': '1.346', 'grad_norm': '3.297', 'learning_rate': '3.395e-05', 'epoch': '0.06793', 'num_input_tokens_seen': 5522806, 'train_runtime': '2793', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '2.76', 'learning_rate': '3.396e-05', 'epoch': '0.06796', 'num_input_tokens_seen': 5524853, 'train_runtime': '2794', 'train_tokens_per_second': '1977'} +{'loss': '3.275', 'grad_norm': '3.663', 'learning_rate': '3.398e-05', 'epoch': '0.06798', 'num_input_tokens_seen': 5526900, 'train_runtime': '2796', 'train_tokens_per_second': '1977'} +{'loss': '1.495', 'grad_norm': '2.543', 'learning_rate': '3.399e-05', 'epoch': '0.06801', 'num_input_tokens_seen': 5528947, 'train_runtime': '2797', 'train_tokens_per_second': '1977'} +{'loss': '0.3601', 'grad_norm': '1.444', 'learning_rate': '3.4e-05', 'epoch': '0.06803', 'num_input_tokens_seen': 5530994, 'train_runtime': '2798', 'train_tokens_per_second': '1977'} +{'loss': '0.4051', 'grad_norm': '1.886', 'learning_rate': '3.401e-05', 'epoch': '0.06806', 'num_input_tokens_seen': 5533041, 'train_runtime': '2799', 'train_tokens_per_second': '1977'} +{'loss': '1.513', 'grad_norm': '2.928', 'learning_rate': '3.403e-05', 'epoch': '0.06808', 'num_input_tokens_seen': 5535088, 'train_runtime': '2800', 'train_tokens_per_second': '1977'} +{'loss': '1.364', 'grad_norm': '2.587', 'learning_rate': '3.404e-05', 'epoch': '0.06811', 'num_input_tokens_seen': 5537135, 'train_runtime': '2801', 'train_tokens_per_second': '1977'} +{'loss': '0.462', 'grad_norm': '1.991', 'learning_rate': '3.405e-05', 'epoch': '0.06813', 'num_input_tokens_seen': 5539182, 'train_runtime': '2802', 'train_tokens_per_second': '1977'} +{'loss': '1.338', 'grad_norm': '2.812', 'learning_rate': '3.406e-05', 'epoch': '0.06816', 'num_input_tokens_seen': 5541229, 'train_runtime': '2803', 'train_tokens_per_second': '1977'} +{'loss': '0.8497', 'grad_norm': '2.353', 'learning_rate': '3.408e-05', 'epoch': '0.06818', 'num_input_tokens_seen': 5543276, 'train_runtime': '2804', 'train_tokens_per_second': '1977'} +{'loss': '0.9339', 'grad_norm': '2.805', 'learning_rate': '3.409e-05', 'epoch': '0.06821', 'num_input_tokens_seen': 5545323, 'train_runtime': '2805', 'train_tokens_per_second': '1977'} +{'loss': '1.863', 'grad_norm': '3.408', 'learning_rate': '3.41e-05', 'epoch': '0.06823', 'num_input_tokens_seen': 5547370, 'train_runtime': '2806', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.947', 'learning_rate': '3.411e-05', 'epoch': '0.06826', 'num_input_tokens_seen': 5549417, 'train_runtime': '2807', 'train_tokens_per_second': '1977'} +{'loss': '0.4967', 'grad_norm': '2.25', 'learning_rate': '3.413e-05', 'epoch': '0.06828', 'num_input_tokens_seen': 5551464, 'train_runtime': '2808', 'train_tokens_per_second': '1977'} +{'loss': '0.5497', 'grad_norm': '2.113', 'learning_rate': '3.414e-05', 'epoch': '0.06831', 'num_input_tokens_seen': 5553511, 'train_runtime': '2809', 'train_tokens_per_second': '1977'} +{'loss': '1.442', 'grad_norm': '3.277', 'learning_rate': '3.415e-05', 'epoch': '0.06833', 'num_input_tokens_seen': 5555558, 'train_runtime': '2810', 'train_tokens_per_second': '1977'} +{'loss': '0.6692', 'grad_norm': '1.948', 'learning_rate': '3.416e-05', 'epoch': '0.06836', 'num_input_tokens_seen': 5557605, 'train_runtime': '2811', 'train_tokens_per_second': '1977'} +{'loss': '0.8113', 'grad_norm': '2.821', 'learning_rate': '3.418e-05', 'epoch': '0.06838', 'num_input_tokens_seen': 5559652, 'train_runtime': '2812', 'train_tokens_per_second': '1977'} +{'loss': '0.8283', 'grad_norm': '2.416', 'learning_rate': '3.419e-05', 'epoch': '0.06841', 'num_input_tokens_seen': 5561699, 'train_runtime': '2813', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '2.961', 'learning_rate': '3.42e-05', 'epoch': '0.06843', 'num_input_tokens_seen': 5563746, 'train_runtime': '2814', 'train_tokens_per_second': '1977'} +{'loss': '0.577', 'grad_norm': '2.401', 'learning_rate': '3.421e-05', 'epoch': '0.06846', 'num_input_tokens_seen': 5565793, 'train_runtime': '2815', 'train_tokens_per_second': '1977'} +{'loss': '0.5732', 'grad_norm': '2.113', 'learning_rate': '3.423e-05', 'epoch': '0.06848', 'num_input_tokens_seen': 5567840, 'train_runtime': '2816', 'train_tokens_per_second': '1977'} +{'loss': '0.7463', 'grad_norm': '2.154', 'learning_rate': '3.424e-05', 'epoch': '0.06851', 'num_input_tokens_seen': 5569887, 'train_runtime': '2817', 'train_tokens_per_second': '1977'} +{'loss': '1.186', 'grad_norm': '2.679', 'learning_rate': '3.425e-05', 'epoch': '0.06853', 'num_input_tokens_seen': 5571934, 'train_runtime': '2818', 'train_tokens_per_second': '1977'} +{'loss': '0.9452', 'grad_norm': '2.632', 'learning_rate': '3.426e-05', 'epoch': '0.06856', 'num_input_tokens_seen': 5573981, 'train_runtime': '2819', 'train_tokens_per_second': '1977'} +{'loss': '0.9545', 'grad_norm': '2.35', 'learning_rate': '3.428e-05', 'epoch': '0.06859', 'num_input_tokens_seen': 5576028, 'train_runtime': '2820', 'train_tokens_per_second': '1977'} +{'loss': '1.129', 'grad_norm': '2.532', 'learning_rate': '3.429e-05', 'epoch': '0.06861', 'num_input_tokens_seen': 5578075, 'train_runtime': '2821', 'train_tokens_per_second': '1977'} +{'loss': '1.66', 'grad_norm': '3.3', 'learning_rate': '3.43e-05', 'epoch': '0.06864', 'num_input_tokens_seen': 5580122, 'train_runtime': '2822', 'train_tokens_per_second': '1977'} +{'loss': '2.546', 'grad_norm': '3.665', 'learning_rate': '3.432e-05', 'epoch': '0.06866', 'num_input_tokens_seen': 5582169, 'train_runtime': '2823', 'train_tokens_per_second': '1977'} +{'loss': '0.942', 'grad_norm': '2.831', 'learning_rate': '3.433e-05', 'epoch': '0.06869', 'num_input_tokens_seen': 5584216, 'train_runtime': '2824', 'train_tokens_per_second': '1977'} +{'loss': '1.659', 'grad_norm': '3.015', 'learning_rate': '3.434e-05', 'epoch': '0.06871', 'num_input_tokens_seen': 5586263, 'train_runtime': '2826', 'train_tokens_per_second': '1977'} +{'loss': '0.9525', 'grad_norm': '2.448', 'learning_rate': '3.435e-05', 'epoch': '0.06874', 'num_input_tokens_seen': 5588310, 'train_runtime': '2827', 'train_tokens_per_second': '1977'} +{'loss': '2.073', 'grad_norm': '3.196', 'learning_rate': '3.437e-05', 'epoch': '0.06876', 'num_input_tokens_seen': 5590357, 'train_runtime': '2828', 'train_tokens_per_second': '1977'} +{'loss': '0.8086', 'grad_norm': '2.623', 'learning_rate': '3.438e-05', 'epoch': '0.06879', 'num_input_tokens_seen': 5592404, 'train_runtime': '2829', 'train_tokens_per_second': '1977'} +{'loss': '0.4766', 'grad_norm': '1.848', 'learning_rate': '3.439e-05', 'epoch': '0.06881', 'num_input_tokens_seen': 5594451, 'train_runtime': '2830', 'train_tokens_per_second': '1977'} +{'loss': '0.9355', 'grad_norm': '3.05', 'learning_rate': '3.44e-05', 'epoch': '0.06884', 'num_input_tokens_seen': 5596498, 'train_runtime': '2831', 'train_tokens_per_second': '1977'} +{'loss': '1.002', 'grad_norm': '2.34', 'learning_rate': '3.442e-05', 'epoch': '0.06886', 'num_input_tokens_seen': 5598545, 'train_runtime': '2832', 'train_tokens_per_second': '1977'} +{'loss': '0.9231', 'grad_norm': '2.373', 'learning_rate': '3.443e-05', 'epoch': '0.06889', 'num_input_tokens_seen': 5600592, 'train_runtime': '2833', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '2.048', 'learning_rate': '3.444e-05', 'epoch': '0.06891', 'num_input_tokens_seen': 5602639, 'train_runtime': '2834', 'train_tokens_per_second': '1977'} +{'loss': '0.5886', 'grad_norm': '2.042', 'learning_rate': '3.445e-05', 'epoch': '0.06894', 'num_input_tokens_seen': 5604686, 'train_runtime': '2835', 'train_tokens_per_second': '1977'} +{'loss': '0.5826', 'grad_norm': '1.79', 'learning_rate': '3.447e-05', 'epoch': '0.06896', 'num_input_tokens_seen': 5606733, 'train_runtime': '2836', 'train_tokens_per_second': '1977'} +{'loss': '0.5929', 'grad_norm': '2.034', 'learning_rate': '3.448e-05', 'epoch': '0.06899', 'num_input_tokens_seen': 5608780, 'train_runtime': '2837', 'train_tokens_per_second': '1977'} +{'loss': '0.4851', 'grad_norm': '1.753', 'learning_rate': '3.449e-05', 'epoch': '0.06901', 'num_input_tokens_seen': 5610827, 'train_runtime': '2838', 'train_tokens_per_second': '1977'} +{'loss': '0.9147', 'grad_norm': '2.401', 'learning_rate': '3.45e-05', 'epoch': '0.06904', 'num_input_tokens_seen': 5612874, 'train_runtime': '2839', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '2.891', 'learning_rate': '3.452e-05', 'epoch': '0.06906', 'num_input_tokens_seen': 5614921, 'train_runtime': '2840', 'train_tokens_per_second': '1977'} +{'loss': '2.297', 'grad_norm': '3.294', 'learning_rate': '3.453e-05', 'epoch': '0.06909', 'num_input_tokens_seen': 5616968, 'train_runtime': '2841', 'train_tokens_per_second': '1977'} +{'loss': '0.432', 'grad_norm': '1.969', 'learning_rate': '3.454e-05', 'epoch': '0.06911', 'num_input_tokens_seen': 5619015, 'train_runtime': '2842', 'train_tokens_per_second': '1977'} +{'loss': '1.96', 'grad_norm': '4.254', 'learning_rate': '3.455e-05', 'epoch': '0.06914', 'num_input_tokens_seen': 5621062, 'train_runtime': '2843', 'train_tokens_per_second': '1977'} +{'loss': '0.4031', 'grad_norm': '1.843', 'learning_rate': '3.457e-05', 'epoch': '0.06916', 'num_input_tokens_seen': 5623109, 'train_runtime': '2844', 'train_tokens_per_second': '1977'} +{'loss': '0.8077', 'grad_norm': '1.996', 'learning_rate': '3.458e-05', 'epoch': '0.06919', 'num_input_tokens_seen': 5625156, 'train_runtime': '2845', 'train_tokens_per_second': '1977'} +{'loss': '1.899', 'grad_norm': '3.252', 'learning_rate': '3.459e-05', 'epoch': '0.06921', 'num_input_tokens_seen': 5627203, 'train_runtime': '2846', 'train_tokens_per_second': '1977'} +{'loss': '1.257', 'grad_norm': '2.783', 'learning_rate': '3.46e-05', 'epoch': '0.06924', 'num_input_tokens_seen': 5629250, 'train_runtime': '2847', 'train_tokens_per_second': '1977'} +{'loss': '0.5327', 'grad_norm': '2.337', 'learning_rate': '3.462e-05', 'epoch': '0.06927', 'num_input_tokens_seen': 5631297, 'train_runtime': '2848', 'train_tokens_per_second': '1977'} +{'loss': '0.8876', 'grad_norm': '2.566', 'learning_rate': '3.463e-05', 'epoch': '0.06929', 'num_input_tokens_seen': 5633344, 'train_runtime': '2849', 'train_tokens_per_second': '1977'} +{'loss': '0.7075', 'grad_norm': '2.281', 'learning_rate': '3.464e-05', 'epoch': '0.06932', 'num_input_tokens_seen': 5635391, 'train_runtime': '2850', 'train_tokens_per_second': '1977'} +{'loss': '0.4706', 'grad_norm': '2.166', 'learning_rate': '3.466e-05', 'epoch': '0.06934', 'num_input_tokens_seen': 5637438, 'train_runtime': '2851', 'train_tokens_per_second': '1977'} +{'loss': '2.338', 'grad_norm': '3.302', 'learning_rate': '3.467e-05', 'epoch': '0.06937', 'num_input_tokens_seen': 5639485, 'train_runtime': '2852', 'train_tokens_per_second': '1977'} +{'loss': '0.4532', 'grad_norm': '2.034', 'learning_rate': '3.468e-05', 'epoch': '0.06939', 'num_input_tokens_seen': 5641532, 'train_runtime': '2853', 'train_tokens_per_second': '1977'} +{'loss': '0.5133', 'grad_norm': '1.505', 'learning_rate': '3.469e-05', 'epoch': '0.06942', 'num_input_tokens_seen': 5643579, 'train_runtime': '2854', 'train_tokens_per_second': '1977'} +{'loss': '0.5102', 'grad_norm': '1.854', 'learning_rate': '3.471e-05', 'epoch': '0.06944', 'num_input_tokens_seen': 5645626, 'train_runtime': '2856', 'train_tokens_per_second': '1977'} +{'loss': '0.3947', 'grad_norm': '1.687', 'learning_rate': '3.472e-05', 'epoch': '0.06947', 'num_input_tokens_seen': 5647673, 'train_runtime': '2857', 'train_tokens_per_second': '1977'} +{'loss': '0.8704', 'grad_norm': '2.719', 'learning_rate': '3.473e-05', 'epoch': '0.06949', 'num_input_tokens_seen': 5649720, 'train_runtime': '2858', 'train_tokens_per_second': '1977'} +{'loss': '2.109', 'grad_norm': '3.633', 'learning_rate': '3.474e-05', 'epoch': '0.06952', 'num_input_tokens_seen': 5651767, 'train_runtime': '2859', 'train_tokens_per_second': '1977'} +{'loss': '0.9398', 'grad_norm': '3.38', 'learning_rate': '3.476e-05', 'epoch': '0.06954', 'num_input_tokens_seen': 5653814, 'train_runtime': '2860', 'train_tokens_per_second': '1977'} +{'loss': '1.704', 'grad_norm': '3.064', 'learning_rate': '3.477e-05', 'epoch': '0.06957', 'num_input_tokens_seen': 5655861, 'train_runtime': '2861', 'train_tokens_per_second': '1977'} +{'loss': '1.828', 'grad_norm': '3.443', 'learning_rate': '3.478e-05', 'epoch': '0.06959', 'num_input_tokens_seen': 5657908, 'train_runtime': '2862', 'train_tokens_per_second': '1977'} +{'loss': '2.358', 'grad_norm': '3.406', 'learning_rate': '3.479e-05', 'epoch': '0.06962', 'num_input_tokens_seen': 5659955, 'train_runtime': '2863', 'train_tokens_per_second': '1977'} +{'loss': '0.4973', 'grad_norm': '2.134', 'learning_rate': '3.481e-05', 'epoch': '0.06964', 'num_input_tokens_seen': 5662002, 'train_runtime': '2864', 'train_tokens_per_second': '1977'} +{'loss': '0.5097', 'grad_norm': '1.904', 'learning_rate': '3.482e-05', 'epoch': '0.06967', 'num_input_tokens_seen': 5664049, 'train_runtime': '2865', 'train_tokens_per_second': '1977'} +{'loss': '0.393', 'grad_norm': '1.613', 'learning_rate': '3.483e-05', 'epoch': '0.06969', 'num_input_tokens_seen': 5666096, 'train_runtime': '2866', 'train_tokens_per_second': '1977'} +{'loss': '1.216', 'grad_norm': '3.039', 'learning_rate': '3.484e-05', 'epoch': '0.06972', 'num_input_tokens_seen': 5668143, 'train_runtime': '2867', 'train_tokens_per_second': '1977'} +{'loss': '1.337', 'grad_norm': '2.891', 'learning_rate': '3.486e-05', 'epoch': '0.06974', 'num_input_tokens_seen': 5670190, 'train_runtime': '2868', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '1.988', 'learning_rate': '3.487e-05', 'epoch': '0.06977', 'num_input_tokens_seen': 5672237, 'train_runtime': '2869', 'train_tokens_per_second': '1977'} +{'loss': '0.4454', 'grad_norm': '1.676', 'learning_rate': '3.488e-05', 'epoch': '0.06979', 'num_input_tokens_seen': 5674284, 'train_runtime': '2870', 'train_tokens_per_second': '1977'} +{'loss': '0.9752', 'grad_norm': '2.877', 'learning_rate': '3.489e-05', 'epoch': '0.06982', 'num_input_tokens_seen': 5676331, 'train_runtime': '2871', 'train_tokens_per_second': '1977'} +{'loss': '0.6038', 'grad_norm': '2.12', 'learning_rate': '3.491e-05', 'epoch': '0.06984', 'num_input_tokens_seen': 5678378, 'train_runtime': '2872', 'train_tokens_per_second': '1977'} +{'loss': '1.702', 'grad_norm': '2.947', 'learning_rate': '3.492e-05', 'epoch': '0.06987', 'num_input_tokens_seen': 5680425, 'train_runtime': '2873', 'train_tokens_per_second': '1977'} +{'loss': '2.093', 'grad_norm': '2.857', 'learning_rate': '3.493e-05', 'epoch': '0.06989', 'num_input_tokens_seen': 5682472, 'train_runtime': '2874', 'train_tokens_per_second': '1977'} +{'loss': '0.8087', 'grad_norm': '2.277', 'learning_rate': '3.494e-05', 'epoch': '0.06992', 'num_input_tokens_seen': 5684519, 'train_runtime': '2875', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '2.66', 'learning_rate': '3.496e-05', 'epoch': '0.06994', 'num_input_tokens_seen': 5686566, 'train_runtime': '2876', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '2.755', 'learning_rate': '3.497e-05', 'epoch': '0.06997', 'num_input_tokens_seen': 5688613, 'train_runtime': '2877', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '2.428', 'learning_rate': '3.498e-05', 'epoch': '0.07', 'num_input_tokens_seen': 5690660, 'train_runtime': '2878', 'train_tokens_per_second': '1977'} +{'loss': '0.659', 'grad_norm': '2.485', 'learning_rate': '3.499e-05', 'epoch': '0.07002', 'num_input_tokens_seen': 5692707, 'train_runtime': '2879', 'train_tokens_per_second': '1977'} +{'loss': '1.413', 'grad_norm': '2.706', 'learning_rate': '3.501e-05', 'epoch': '0.07005', 'num_input_tokens_seen': 5694754, 'train_runtime': '2880', 'train_tokens_per_second': '1977'} +{'loss': '0.7772', 'grad_norm': '2.513', 'learning_rate': '3.502e-05', 'epoch': '0.07007', 'num_input_tokens_seen': 5696801, 'train_runtime': '2881', 'train_tokens_per_second': '1977'} +{'loss': '2.011', 'grad_norm': '3.528', 'learning_rate': '3.503e-05', 'epoch': '0.0701', 'num_input_tokens_seen': 5698848, 'train_runtime': '2882', 'train_tokens_per_second': '1977'} +{'loss': '1.711', 'grad_norm': '3.404', 'learning_rate': '3.505e-05', 'epoch': '0.07012', 'num_input_tokens_seen': 5700895, 'train_runtime': '2883', 'train_tokens_per_second': '1977'} +{'loss': '0.4215', 'grad_norm': '1.502', 'learning_rate': '3.506e-05', 'epoch': '0.07015', 'num_input_tokens_seen': 5702942, 'train_runtime': '2884', 'train_tokens_per_second': '1977'} +{'loss': '1.001', 'grad_norm': '2.619', 'learning_rate': '3.507e-05', 'epoch': '0.07017', 'num_input_tokens_seen': 5704989, 'train_runtime': '2885', 'train_tokens_per_second': '1977'} +{'loss': '1.053', 'grad_norm': '3.371', 'learning_rate': '3.508e-05', 'epoch': '0.0702', 'num_input_tokens_seen': 5707036, 'train_runtime': '2887', 'train_tokens_per_second': '1977'} +{'loss': '1.082', 'grad_norm': '2.62', 'learning_rate': '3.51e-05', 'epoch': '0.07022', 'num_input_tokens_seen': 5709083, 'train_runtime': '2888', 'train_tokens_per_second': '1977'} +{'loss': '0.3868', 'grad_norm': '1.746', 'learning_rate': '3.511e-05', 'epoch': '0.07025', 'num_input_tokens_seen': 5711130, 'train_runtime': '2889', 'train_tokens_per_second': '1977'} +{'loss': '0.7395', 'grad_norm': '2.449', 'learning_rate': '3.512e-05', 'epoch': '0.07027', 'num_input_tokens_seen': 5713177, 'train_runtime': '2890', 'train_tokens_per_second': '1977'} +{'loss': '0.8098', 'grad_norm': '2.328', 'learning_rate': '3.513e-05', 'epoch': '0.0703', 'num_input_tokens_seen': 5715224, 'train_runtime': '2891', 'train_tokens_per_second': '1977'} +{'loss': '0.7912', 'grad_norm': '2.479', 'learning_rate': '3.515e-05', 'epoch': '0.07032', 'num_input_tokens_seen': 5717271, 'train_runtime': '2892', 'train_tokens_per_second': '1977'} +{'loss': '0.6543', 'grad_norm': '2.475', 'learning_rate': '3.516e-05', 'epoch': '0.07035', 'num_input_tokens_seen': 5719318, 'train_runtime': '2893', 'train_tokens_per_second': '1977'} +{'loss': '0.893', 'grad_norm': '2.02', 'learning_rate': '3.517e-05', 'epoch': '0.07037', 'num_input_tokens_seen': 5721365, 'train_runtime': '2894', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '2.537', 'learning_rate': '3.518e-05', 'epoch': '0.0704', 'num_input_tokens_seen': 5723412, 'train_runtime': '2895', 'train_tokens_per_second': '1977'} +{'loss': '1.409', 'grad_norm': '3.227', 'learning_rate': '3.52e-05', 'epoch': '0.07042', 'num_input_tokens_seen': 5725459, 'train_runtime': '2896', 'train_tokens_per_second': '1977'} +{'loss': '0.33', 'grad_norm': '1.529', 'learning_rate': '3.521e-05', 'epoch': '0.07045', 'num_input_tokens_seen': 5727506, 'train_runtime': '2897', 'train_tokens_per_second': '1977'} +{'loss': '1.255', 'grad_norm': '2.891', 'learning_rate': '3.522e-05', 'epoch': '0.07047', 'num_input_tokens_seen': 5729553, 'train_runtime': '2898', 'train_tokens_per_second': '1977'} +{'loss': '2.494', 'grad_norm': '3.077', 'learning_rate': '3.523e-05', 'epoch': '0.0705', 'num_input_tokens_seen': 5731600, 'train_runtime': '2899', 'train_tokens_per_second': '1977'} +{'loss': '1.606', 'grad_norm': '2.637', 'learning_rate': '3.525e-05', 'epoch': '0.07052', 'num_input_tokens_seen': 5733647, 'train_runtime': '2900', 'train_tokens_per_second': '1977'} +{'loss': '0.4388', 'grad_norm': '1.447', 'learning_rate': '3.526e-05', 'epoch': '0.07055', 'num_input_tokens_seen': 5735694, 'train_runtime': '2901', 'train_tokens_per_second': '1977'} +{'loss': '1.045', 'grad_norm': '2.649', 'learning_rate': '3.527e-05', 'epoch': '0.07057', 'num_input_tokens_seen': 5737741, 'train_runtime': '2902', 'train_tokens_per_second': '1977'} +{'loss': '0.5593', 'grad_norm': '2.294', 'learning_rate': '3.528e-05', 'epoch': '0.0706', 'num_input_tokens_seen': 5739788, 'train_runtime': '2903', 'train_tokens_per_second': '1977'} +{'loss': '0.8675', 'grad_norm': '3.24', 'learning_rate': '3.53e-05', 'epoch': '0.07062', 'num_input_tokens_seen': 5741835, 'train_runtime': '2904', 'train_tokens_per_second': '1977'} +{'loss': '0.4198', 'grad_norm': '2', 'learning_rate': '3.531e-05', 'epoch': '0.07065', 'num_input_tokens_seen': 5743882, 'train_runtime': '2905', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '3.72', 'learning_rate': '3.532e-05', 'epoch': '0.07068', 'num_input_tokens_seen': 5745929, 'train_runtime': '2906', 'train_tokens_per_second': '1977'} +{'loss': '0.7591', 'grad_norm': '2.076', 'learning_rate': '3.533e-05', 'epoch': '0.0707', 'num_input_tokens_seen': 5747976, 'train_runtime': '2907', 'train_tokens_per_second': '1977'} +{'loss': '1.699', 'grad_norm': '3.029', 'learning_rate': '3.535e-05', 'epoch': '0.07073', 'num_input_tokens_seen': 5750023, 'train_runtime': '2908', 'train_tokens_per_second': '1977'} +{'loss': '0.6253', 'grad_norm': '2.593', 'learning_rate': '3.536e-05', 'epoch': '0.07075', 'num_input_tokens_seen': 5752070, 'train_runtime': '2909', 'train_tokens_per_second': '1977'} +{'loss': '0.3309', 'grad_norm': '1.765', 'learning_rate': '3.537e-05', 'epoch': '0.07078', 'num_input_tokens_seen': 5754117, 'train_runtime': '2910', 'train_tokens_per_second': '1977'} +{'loss': '0.9076', 'grad_norm': '2.54', 'learning_rate': '3.539e-05', 'epoch': '0.0708', 'num_input_tokens_seen': 5756164, 'train_runtime': '2911', 'train_tokens_per_second': '1977'} +{'loss': '0.4878', 'grad_norm': '1.595', 'learning_rate': '3.54e-05', 'epoch': '0.07083', 'num_input_tokens_seen': 5758211, 'train_runtime': '2912', 'train_tokens_per_second': '1977'} +{'loss': '1.842', 'grad_norm': '3.515', 'learning_rate': '3.541e-05', 'epoch': '0.07085', 'num_input_tokens_seen': 5760258, 'train_runtime': '2913', 'train_tokens_per_second': '1977'} +{'loss': '0.86', 'grad_norm': '2.322', 'learning_rate': '3.542e-05', 'epoch': '0.07088', 'num_input_tokens_seen': 5762305, 'train_runtime': '2914', 'train_tokens_per_second': '1977'} +{'loss': '0.8723', 'grad_norm': '2.412', 'learning_rate': '3.544e-05', 'epoch': '0.0709', 'num_input_tokens_seen': 5764352, 'train_runtime': '2916', 'train_tokens_per_second': '1977'} +{'loss': '1.785', 'grad_norm': '4.027', 'learning_rate': '3.545e-05', 'epoch': '0.07093', 'num_input_tokens_seen': 5766399, 'train_runtime': '2917', 'train_tokens_per_second': '1977'} +{'loss': '0.8236', 'grad_norm': '2.104', 'learning_rate': '3.546e-05', 'epoch': '0.07095', 'num_input_tokens_seen': 5768446, 'train_runtime': '2918', 'train_tokens_per_second': '1977'} +{'loss': '1.642', 'grad_norm': '2.177', 'learning_rate': '3.547e-05', 'epoch': '0.07098', 'num_input_tokens_seen': 5770493, 'train_runtime': '2919', 'train_tokens_per_second': '1977'} +{'loss': '1.354', 'grad_norm': '2.435', 'learning_rate': '3.549e-05', 'epoch': '0.071', 'num_input_tokens_seen': 5772540, 'train_runtime': '2920', 'train_tokens_per_second': '1977'} +{'loss': '1.139', 'grad_norm': '3.088', 'learning_rate': '3.55e-05', 'epoch': '0.07103', 'num_input_tokens_seen': 5774587, 'train_runtime': '2921', 'train_tokens_per_second': '1977'} +{'loss': '0.9588', 'grad_norm': '2.32', 'learning_rate': '3.551e-05', 'epoch': '0.07105', 'num_input_tokens_seen': 5776634, 'train_runtime': '2922', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '2.531', 'learning_rate': '3.552e-05', 'epoch': '0.07108', 'num_input_tokens_seen': 5778681, 'train_runtime': '2923', 'train_tokens_per_second': '1977'} +{'loss': '0.3462', 'grad_norm': '1.793', 'learning_rate': '3.554e-05', 'epoch': '0.0711', 'num_input_tokens_seen': 5780728, 'train_runtime': '2924', 'train_tokens_per_second': '1977'} +{'loss': '0.4616', 'grad_norm': '2.124', 'learning_rate': '3.555e-05', 'epoch': '0.07113', 'num_input_tokens_seen': 5782775, 'train_runtime': '2925', 'train_tokens_per_second': '1977'} +{'loss': '0.4705', 'grad_norm': '1.881', 'learning_rate': '3.556e-05', 'epoch': '0.07115', 'num_input_tokens_seen': 5784822, 'train_runtime': '2926', 'train_tokens_per_second': '1977'} +{'loss': '0.9339', 'grad_norm': '2.76', 'learning_rate': '3.557e-05', 'epoch': '0.07118', 'num_input_tokens_seen': 5786869, 'train_runtime': '2927', 'train_tokens_per_second': '1977'} +{'loss': '0.7721', 'grad_norm': '2.553', 'learning_rate': '3.559e-05', 'epoch': '0.0712', 'num_input_tokens_seen': 5788916, 'train_runtime': '2928', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '2.749', 'learning_rate': '3.56e-05', 'epoch': '0.07123', 'num_input_tokens_seen': 5790963, 'train_runtime': '2929', 'train_tokens_per_second': '1977'} +{'loss': '0.6775', 'grad_norm': '2.267', 'learning_rate': '3.561e-05', 'epoch': '0.07125', 'num_input_tokens_seen': 5793010, 'train_runtime': '2930', 'train_tokens_per_second': '1977'} +{'loss': '1.079', 'grad_norm': '2.52', 'learning_rate': '3.562e-05', 'epoch': '0.07128', 'num_input_tokens_seen': 5795057, 'train_runtime': '2931', 'train_tokens_per_second': '1977'} +{'loss': '0.9561', 'grad_norm': '2.849', 'learning_rate': '3.564e-05', 'epoch': '0.0713', 'num_input_tokens_seen': 5797104, 'train_runtime': '2932', 'train_tokens_per_second': '1977'} +{'loss': '0.9437', 'grad_norm': '3.311', 'learning_rate': '3.565e-05', 'epoch': '0.07133', 'num_input_tokens_seen': 5799151, 'train_runtime': '2933', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '3.192', 'learning_rate': '3.566e-05', 'epoch': '0.07135', 'num_input_tokens_seen': 5801198, 'train_runtime': '2934', 'train_tokens_per_second': '1977'} +{'loss': '0.4197', 'grad_norm': '2.395', 'learning_rate': '3.567e-05', 'epoch': '0.07138', 'num_input_tokens_seen': 5803245, 'train_runtime': '2935', 'train_tokens_per_second': '1977'} +{'loss': '1.121', 'grad_norm': '2.447', 'learning_rate': '3.569e-05', 'epoch': '0.07141', 'num_input_tokens_seen': 5805292, 'train_runtime': '2936', 'train_tokens_per_second': '1977'} +{'loss': '0.4719', 'grad_norm': '1.563', 'learning_rate': '3.57e-05', 'epoch': '0.07143', 'num_input_tokens_seen': 5807339, 'train_runtime': '2937', 'train_tokens_per_second': '1977'} +{'loss': '0.4646', 'grad_norm': '1.681', 'learning_rate': '3.571e-05', 'epoch': '0.07146', 'num_input_tokens_seen': 5809386, 'train_runtime': '2938', 'train_tokens_per_second': '1977'} +{'loss': '0.919', 'grad_norm': '2.262', 'learning_rate': '3.573e-05', 'epoch': '0.07148', 'num_input_tokens_seen': 5811433, 'train_runtime': '2939', 'train_tokens_per_second': '1977'} +{'loss': '0.6935', 'grad_norm': '2.005', 'learning_rate': '3.574e-05', 'epoch': '0.07151', 'num_input_tokens_seen': 5813480, 'train_runtime': '2940', 'train_tokens_per_second': '1977'} +{'loss': '2.613', 'grad_norm': '3.663', 'learning_rate': '3.575e-05', 'epoch': '0.07153', 'num_input_tokens_seen': 5815527, 'train_runtime': '2941', 'train_tokens_per_second': '1977'} +{'loss': '2.237', 'grad_norm': '3.623', 'learning_rate': '3.576e-05', 'epoch': '0.07156', 'num_input_tokens_seen': 5817574, 'train_runtime': '2942', 'train_tokens_per_second': '1977'} +{'loss': '0.5857', 'grad_norm': '2.126', 'learning_rate': '3.578e-05', 'epoch': '0.07158', 'num_input_tokens_seen': 5819621, 'train_runtime': '2943', 'train_tokens_per_second': '1977'} +{'loss': '1.886', 'grad_norm': '2.951', 'learning_rate': '3.579e-05', 'epoch': '0.07161', 'num_input_tokens_seen': 5821668, 'train_runtime': '2944', 'train_tokens_per_second': '1977'} +{'loss': '0.681', 'grad_norm': '2.046', 'learning_rate': '3.58e-05', 'epoch': '0.07163', 'num_input_tokens_seen': 5823715, 'train_runtime': '2946', 'train_tokens_per_second': '1977'} +{'loss': '0.3976', 'grad_norm': '1.596', 'learning_rate': '3.581e-05', 'epoch': '0.07166', 'num_input_tokens_seen': 5825762, 'train_runtime': '2947', 'train_tokens_per_second': '1977'} +{'loss': '0.4294', 'grad_norm': '1.641', 'learning_rate': '3.583e-05', 'epoch': '0.07168', 'num_input_tokens_seen': 5827809, 'train_runtime': '2948', 'train_tokens_per_second': '1977'} +{'loss': '0.6898', 'grad_norm': '2.001', 'learning_rate': '3.584e-05', 'epoch': '0.07171', 'num_input_tokens_seen': 5829856, 'train_runtime': '2949', 'train_tokens_per_second': '1977'} +{'loss': '2.102', 'grad_norm': '3.198', 'learning_rate': '3.585e-05', 'epoch': '0.07173', 'num_input_tokens_seen': 5831903, 'train_runtime': '2950', 'train_tokens_per_second': '1977'} +{'loss': '0.4092', 'grad_norm': '1.646', 'learning_rate': '3.586e-05', 'epoch': '0.07176', 'num_input_tokens_seen': 5833950, 'train_runtime': '2951', 'train_tokens_per_second': '1977'} +{'loss': '1.757', 'grad_norm': '3.204', 'learning_rate': '3.588e-05', 'epoch': '0.07178', 'num_input_tokens_seen': 5835997, 'train_runtime': '2952', 'train_tokens_per_second': '1977'} +{'loss': '0.4563', 'grad_norm': '1.6', 'learning_rate': '3.589e-05', 'epoch': '0.07181', 'num_input_tokens_seen': 5838044, 'train_runtime': '2953', 'train_tokens_per_second': '1977'} +{'loss': '0.4413', 'grad_norm': '1.718', 'learning_rate': '3.59e-05', 'epoch': '0.07183', 'num_input_tokens_seen': 5840091, 'train_runtime': '2954', 'train_tokens_per_second': '1977'} +{'loss': '1.343', 'grad_norm': '2.037', 'learning_rate': '3.591e-05', 'epoch': '0.07186', 'num_input_tokens_seen': 5842138, 'train_runtime': '2955', 'train_tokens_per_second': '1977'} +{'loss': '0.7802', 'grad_norm': '2.956', 'learning_rate': '3.593e-05', 'epoch': '0.07188', 'num_input_tokens_seen': 5844185, 'train_runtime': '2956', 'train_tokens_per_second': '1977'} +{'loss': '0.5232', 'grad_norm': '1.998', 'learning_rate': '3.594e-05', 'epoch': '0.07191', 'num_input_tokens_seen': 5846232, 'train_runtime': '2957', 'train_tokens_per_second': '1977'} +{'loss': '0.4341', 'grad_norm': '1.527', 'learning_rate': '3.595e-05', 'epoch': '0.07193', 'num_input_tokens_seen': 5848279, 'train_runtime': '2958', 'train_tokens_per_second': '1977'} +{'loss': '1.806', 'grad_norm': '2.582', 'learning_rate': '3.596e-05', 'epoch': '0.07196', 'num_input_tokens_seen': 5850326, 'train_runtime': '2959', 'train_tokens_per_second': '1977'} +{'loss': '0.4662', 'grad_norm': '1.641', 'learning_rate': '3.598e-05', 'epoch': '0.07198', 'num_input_tokens_seen': 5852373, 'train_runtime': '2960', 'train_tokens_per_second': '1977'} +{'loss': '0.5657', 'grad_norm': '2.124', 'learning_rate': '3.599e-05', 'epoch': '0.07201', 'num_input_tokens_seen': 5854420, 'train_runtime': '2961', 'train_tokens_per_second': '1977'} +{'loss': '0.4073', 'grad_norm': '1.685', 'learning_rate': '3.6e-05', 'epoch': '0.07203', 'num_input_tokens_seen': 5856467, 'train_runtime': '2962', 'train_tokens_per_second': '1977'} +{'loss': '0.4617', 'grad_norm': '1.952', 'learning_rate': '3.601e-05', 'epoch': '0.07206', 'num_input_tokens_seen': 5858514, 'train_runtime': '2963', 'train_tokens_per_second': '1977'} +{'loss': '0.7898', 'grad_norm': '1.98', 'learning_rate': '3.603e-05', 'epoch': '0.07209', 'num_input_tokens_seen': 5860561, 'train_runtime': '2964', 'train_tokens_per_second': '1977'} +{'loss': '0.5313', 'grad_norm': '2.068', 'learning_rate': '3.604e-05', 'epoch': '0.07211', 'num_input_tokens_seen': 5862608, 'train_runtime': '2965', 'train_tokens_per_second': '1977'} +{'loss': '0.5809', 'grad_norm': '1.799', 'learning_rate': '3.605e-05', 'epoch': '0.07214', 'num_input_tokens_seen': 5864655, 'train_runtime': '2966', 'train_tokens_per_second': '1977'} +{'loss': '0.3891', 'grad_norm': '1.863', 'learning_rate': '3.606e-05', 'epoch': '0.07216', 'num_input_tokens_seen': 5866702, 'train_runtime': '2967', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '2.572', 'learning_rate': '3.608e-05', 'epoch': '0.07219', 'num_input_tokens_seen': 5868749, 'train_runtime': '2968', 'train_tokens_per_second': '1977'} +{'loss': '0.7402', 'grad_norm': '2.259', 'learning_rate': '3.609e-05', 'epoch': '0.07221', 'num_input_tokens_seen': 5870796, 'train_runtime': '2969', 'train_tokens_per_second': '1977'} +{'loss': '0.6693', 'grad_norm': '2.066', 'learning_rate': '3.61e-05', 'epoch': '0.07224', 'num_input_tokens_seen': 5872843, 'train_runtime': '2970', 'train_tokens_per_second': '1977'} +{'loss': '0.9004', 'grad_norm': '2.421', 'learning_rate': '3.612e-05', 'epoch': '0.07226', 'num_input_tokens_seen': 5874890, 'train_runtime': '2971', 'train_tokens_per_second': '1977'} +{'loss': '0.3919', 'grad_norm': '1.623', 'learning_rate': '3.613e-05', 'epoch': '0.07229', 'num_input_tokens_seen': 5876937, 'train_runtime': '2972', 'train_tokens_per_second': '1977'} +{'loss': '0.4451', 'grad_norm': '1.866', 'learning_rate': '3.614e-05', 'epoch': '0.07231', 'num_input_tokens_seen': 5878984, 'train_runtime': '2973', 'train_tokens_per_second': '1977'} +{'loss': '0.4033', 'grad_norm': '1.772', 'learning_rate': '3.615e-05', 'epoch': '0.07234', 'num_input_tokens_seen': 5881031, 'train_runtime': '2975', 'train_tokens_per_second': '1977'} +{'loss': '0.843', 'grad_norm': '2.368', 'learning_rate': '3.617e-05', 'epoch': '0.07236', 'num_input_tokens_seen': 5883078, 'train_runtime': '2976', 'train_tokens_per_second': '1977'} +{'loss': '1.905', 'grad_norm': '3.203', 'learning_rate': '3.618e-05', 'epoch': '0.07239', 'num_input_tokens_seen': 5885125, 'train_runtime': '2977', 'train_tokens_per_second': '1977'} +{'loss': '0.9787', 'grad_norm': '3.16', 'learning_rate': '3.619e-05', 'epoch': '0.07241', 'num_input_tokens_seen': 5887172, 'train_runtime': '2978', 'train_tokens_per_second': '1977'} +{'loss': '1.901', 'grad_norm': '3.594', 'learning_rate': '3.62e-05', 'epoch': '0.07244', 'num_input_tokens_seen': 5889219, 'train_runtime': '2979', 'train_tokens_per_second': '1977'} +{'loss': '0.5815', 'grad_norm': '2.142', 'learning_rate': '3.622e-05', 'epoch': '0.07246', 'num_input_tokens_seen': 5891266, 'train_runtime': '2980', 'train_tokens_per_second': '1977'} +{'loss': '0.4691', 'grad_norm': '2.04', 'learning_rate': '3.623e-05', 'epoch': '0.07249', 'num_input_tokens_seen': 5893313, 'train_runtime': '2981', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '2.689', 'learning_rate': '3.624e-05', 'epoch': '0.07251', 'num_input_tokens_seen': 5895360, 'train_runtime': '2982', 'train_tokens_per_second': '1977'} +{'loss': '0.4779', 'grad_norm': '1.573', 'learning_rate': '3.625e-05', 'epoch': '0.07254', 'num_input_tokens_seen': 5897407, 'train_runtime': '2983', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '3.208', 'learning_rate': '3.627e-05', 'epoch': '0.07256', 'num_input_tokens_seen': 5899454, 'train_runtime': '2984', 'train_tokens_per_second': '1977'} +{'loss': '1.672', 'grad_norm': '3.413', 'learning_rate': '3.628e-05', 'epoch': '0.07259', 'num_input_tokens_seen': 5901501, 'train_runtime': '2985', 'train_tokens_per_second': '1977'} +{'loss': '1.989', 'grad_norm': '3.401', 'learning_rate': '3.629e-05', 'epoch': '0.07261', 'num_input_tokens_seen': 5903548, 'train_runtime': '2986', 'train_tokens_per_second': '1977'} +{'loss': '0.6926', 'grad_norm': '2.286', 'learning_rate': '3.63e-05', 'epoch': '0.07264', 'num_input_tokens_seen': 5905595, 'train_runtime': '2987', 'train_tokens_per_second': '1977'} +{'loss': '0.7157', 'grad_norm': '2.782', 'learning_rate': '3.632e-05', 'epoch': '0.07266', 'num_input_tokens_seen': 5907642, 'train_runtime': '2988', 'train_tokens_per_second': '1977'} +{'loss': '1.566', 'grad_norm': '3.596', 'learning_rate': '3.633e-05', 'epoch': '0.07269', 'num_input_tokens_seen': 5909689, 'train_runtime': '2989', 'train_tokens_per_second': '1977'} +{'loss': '0.712', 'grad_norm': '2.024', 'learning_rate': '3.634e-05', 'epoch': '0.07271', 'num_input_tokens_seen': 5911736, 'train_runtime': '2990', 'train_tokens_per_second': '1977'} +{'loss': '0.5024', 'grad_norm': '1.758', 'learning_rate': '3.635e-05', 'epoch': '0.07274', 'num_input_tokens_seen': 5913783, 'train_runtime': '2991', 'train_tokens_per_second': '1977'} +{'loss': '0.4247', 'grad_norm': '1.757', 'learning_rate': '3.637e-05', 'epoch': '0.07276', 'num_input_tokens_seen': 5915830, 'train_runtime': '2992', 'train_tokens_per_second': '1977'} +{'loss': '0.8884', 'grad_norm': '2.644', 'learning_rate': '3.638e-05', 'epoch': '0.07279', 'num_input_tokens_seen': 5917877, 'train_runtime': '2993', 'train_tokens_per_second': '1977'} +{'loss': '1.289', 'grad_norm': '3.185', 'learning_rate': '3.639e-05', 'epoch': '0.07282', 'num_input_tokens_seen': 5919924, 'train_runtime': '2994', 'train_tokens_per_second': '1977'} +{'loss': '0.7401', 'grad_norm': '2.499', 'learning_rate': '3.64e-05', 'epoch': '0.07284', 'num_input_tokens_seen': 5921971, 'train_runtime': '2995', 'train_tokens_per_second': '1977'} +{'loss': '0.567', 'grad_norm': '1.955', 'learning_rate': '3.642e-05', 'epoch': '0.07287', 'num_input_tokens_seen': 5924018, 'train_runtime': '2996', 'train_tokens_per_second': '1977'} +{'loss': '2.465', 'grad_norm': '3.262', 'learning_rate': '3.643e-05', 'epoch': '0.07289', 'num_input_tokens_seen': 5926065, 'train_runtime': '2997', 'train_tokens_per_second': '1977'} +{'loss': '0.4929', 'grad_norm': '1.733', 'learning_rate': '3.644e-05', 'epoch': '0.07292', 'num_input_tokens_seen': 5928112, 'train_runtime': '2998', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '2.543', 'learning_rate': '3.646e-05', 'epoch': '0.07294', 'num_input_tokens_seen': 5930159, 'train_runtime': '2999', 'train_tokens_per_second': '1977'} +{'loss': '0.44', 'grad_norm': '1.928', 'learning_rate': '3.647e-05', 'epoch': '0.07297', 'num_input_tokens_seen': 5932206, 'train_runtime': '3000', 'train_tokens_per_second': '1977'} +{'loss': '1.633', 'grad_norm': '2.724', 'learning_rate': '3.648e-05', 'epoch': '0.07299', 'num_input_tokens_seen': 5934253, 'train_runtime': '3001', 'train_tokens_per_second': '1977'} +{'loss': '0.8813', 'grad_norm': '2.464', 'learning_rate': '3.649e-05', 'epoch': '0.07302', 'num_input_tokens_seen': 5936300, 'train_runtime': '3002', 'train_tokens_per_second': '1977'} +{'loss': '0.651', 'grad_norm': '1.892', 'learning_rate': '3.651e-05', 'epoch': '0.07304', 'num_input_tokens_seen': 5938347, 'train_runtime': '3003', 'train_tokens_per_second': '1977'} +{'loss': '0.655', 'grad_norm': '2.372', 'learning_rate': '3.652e-05', 'epoch': '0.07307', 'num_input_tokens_seen': 5940394, 'train_runtime': '3004', 'train_tokens_per_second': '1977'} +{'loss': '0.9088', 'grad_norm': '2.772', 'learning_rate': '3.653e-05', 'epoch': '0.07309', 'num_input_tokens_seen': 5942441, 'train_runtime': '3006', 'train_tokens_per_second': '1977'} +{'loss': '0.3658', 'grad_norm': '1.815', 'learning_rate': '3.654e-05', 'epoch': '0.07312', 'num_input_tokens_seen': 5944488, 'train_runtime': '3007', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '3.342', 'learning_rate': '3.656e-05', 'epoch': '0.07314', 'num_input_tokens_seen': 5946535, 'train_runtime': '3008', 'train_tokens_per_second': '1977'} +{'loss': '1.559', 'grad_norm': '3.083', 'learning_rate': '3.657e-05', 'epoch': '0.07317', 'num_input_tokens_seen': 5948582, 'train_runtime': '3009', 'train_tokens_per_second': '1977'} +{'loss': '1.841', 'grad_norm': '5.212', 'learning_rate': '3.658e-05', 'epoch': '0.07319', 'num_input_tokens_seen': 5950629, 'train_runtime': '3010', 'train_tokens_per_second': '1977'} +{'loss': '0.8776', 'grad_norm': '2.271', 'learning_rate': '3.659e-05', 'epoch': '0.07322', 'num_input_tokens_seen': 5952676, 'train_runtime': '3011', 'train_tokens_per_second': '1977'} +{'loss': '0.522', 'grad_norm': '1.894', 'learning_rate': '3.661e-05', 'epoch': '0.07324', 'num_input_tokens_seen': 5954723, 'train_runtime': '3012', 'train_tokens_per_second': '1977'} +{'loss': '1.812', 'grad_norm': '3.531', 'learning_rate': '3.662e-05', 'epoch': '0.07327', 'num_input_tokens_seen': 5956770, 'train_runtime': '3013', 'train_tokens_per_second': '1977'} +{'loss': '1.874', 'grad_norm': '3.254', 'learning_rate': '3.663e-05', 'epoch': '0.07329', 'num_input_tokens_seen': 5958817, 'train_runtime': '3014', 'train_tokens_per_second': '1977'} +{'loss': '0.6977', 'grad_norm': '1.933', 'learning_rate': '3.664e-05', 'epoch': '0.07332', 'num_input_tokens_seen': 5960864, 'train_runtime': '3015', 'train_tokens_per_second': '1977'} +{'loss': '0.3758', 'grad_norm': '1.828', 'learning_rate': '3.666e-05', 'epoch': '0.07334', 'num_input_tokens_seen': 5962911, 'train_runtime': '3016', 'train_tokens_per_second': '1977'} +{'loss': '0.7537', 'grad_norm': '2.04', 'learning_rate': '3.667e-05', 'epoch': '0.07337', 'num_input_tokens_seen': 5964958, 'train_runtime': '3017', 'train_tokens_per_second': '1977'} +{'loss': '0.759', 'grad_norm': '1.971', 'learning_rate': '3.668e-05', 'epoch': '0.07339', 'num_input_tokens_seen': 5967005, 'train_runtime': '3018', 'train_tokens_per_second': '1977'} +{'loss': '2.233', 'grad_norm': '4.106', 'learning_rate': '3.669e-05', 'epoch': '0.07342', 'num_input_tokens_seen': 5969052, 'train_runtime': '3019', 'train_tokens_per_second': '1977'} +{'loss': '0.3534', 'grad_norm': '1.505', 'learning_rate': '3.671e-05', 'epoch': '0.07344', 'num_input_tokens_seen': 5971099, 'train_runtime': '3020', 'train_tokens_per_second': '1977'} +{'loss': '0.7211', 'grad_norm': '2.106', 'learning_rate': '3.672e-05', 'epoch': '0.07347', 'num_input_tokens_seen': 5973146, 'train_runtime': '3021', 'train_tokens_per_second': '1977'} +{'loss': '0.6495', 'grad_norm': '2.18', 'learning_rate': '3.673e-05', 'epoch': '0.07349', 'num_input_tokens_seen': 5975193, 'train_runtime': '3022', 'train_tokens_per_second': '1977'} +{'loss': '1.73', 'grad_norm': '3.236', 'learning_rate': '3.674e-05', 'epoch': '0.07352', 'num_input_tokens_seen': 5977240, 'train_runtime': '3023', 'train_tokens_per_second': '1977'} +{'loss': '2.961', 'grad_norm': '4.021', 'learning_rate': '3.676e-05', 'epoch': '0.07355', 'num_input_tokens_seen': 5979287, 'train_runtime': '3024', 'train_tokens_per_second': '1977'} +{'loss': '0.4942', 'grad_norm': '1.839', 'learning_rate': '3.677e-05', 'epoch': '0.07357', 'num_input_tokens_seen': 5981334, 'train_runtime': '3025', 'train_tokens_per_second': '1977'} +{'loss': '0.5084', 'grad_norm': '1.49', 'learning_rate': '3.678e-05', 'epoch': '0.0736', 'num_input_tokens_seen': 5983381, 'train_runtime': '3026', 'train_tokens_per_second': '1977'} +{'loss': '0.4276', 'grad_norm': '1.382', 'learning_rate': '3.68e-05', 'epoch': '0.07362', 'num_input_tokens_seen': 5985428, 'train_runtime': '3027', 'train_tokens_per_second': '1977'} +{'loss': '2.034', 'grad_norm': '3.195', 'learning_rate': '3.681e-05', 'epoch': '0.07365', 'num_input_tokens_seen': 5987475, 'train_runtime': '3028', 'train_tokens_per_second': '1977'} +{'loss': '0.8211', 'grad_norm': '1.863', 'learning_rate': '3.682e-05', 'epoch': '0.07367', 'num_input_tokens_seen': 5989522, 'train_runtime': '3029', 'train_tokens_per_second': '1977'} +{'loss': '0.3957', 'grad_norm': '1.758', 'learning_rate': '3.683e-05', 'epoch': '0.0737', 'num_input_tokens_seen': 5991569, 'train_runtime': '3030', 'train_tokens_per_second': '1977'} +{'loss': '1.053', 'grad_norm': '2.457', 'learning_rate': '3.685e-05', 'epoch': '0.07372', 'num_input_tokens_seen': 5993616, 'train_runtime': '3031', 'train_tokens_per_second': '1977'} +{'loss': '1.023', 'grad_norm': '2.642', 'learning_rate': '3.686e-05', 'epoch': '0.07375', 'num_input_tokens_seen': 5995663, 'train_runtime': '3032', 'train_tokens_per_second': '1977'} +{'loss': '0.4946', 'grad_norm': '1.787', 'learning_rate': '3.687e-05', 'epoch': '0.07377', 'num_input_tokens_seen': 5997710, 'train_runtime': '3033', 'train_tokens_per_second': '1977'} +{'loss': '0.8299', 'grad_norm': '2.713', 'learning_rate': '3.688e-05', 'epoch': '0.0738', 'num_input_tokens_seen': 5999757, 'train_runtime': '3035', 'train_tokens_per_second': '1977'} +{'loss': '0.8264', 'grad_norm': '2.383', 'learning_rate': '3.69e-05', 'epoch': '0.07382', 'num_input_tokens_seen': 6001804, 'train_runtime': '3036', 'train_tokens_per_second': '1977'} +{'loss': '0.955', 'grad_norm': '2.6', 'learning_rate': '3.691e-05', 'epoch': '0.07385', 'num_input_tokens_seen': 6003851, 'train_runtime': '3037', 'train_tokens_per_second': '1977'} +{'loss': '0.4265', 'grad_norm': '1.583', 'learning_rate': '3.692e-05', 'epoch': '0.07387', 'num_input_tokens_seen': 6005898, 'train_runtime': '3038', 'train_tokens_per_second': '1977'} +{'loss': '1.033', 'grad_norm': '4.031', 'learning_rate': '3.693e-05', 'epoch': '0.0739', 'num_input_tokens_seen': 6007945, 'train_runtime': '3039', 'train_tokens_per_second': '1977'} +{'loss': '0.8313', 'grad_norm': '1.999', 'learning_rate': '3.695e-05', 'epoch': '0.07392', 'num_input_tokens_seen': 6009992, 'train_runtime': '3040', 'train_tokens_per_second': '1977'} +{'loss': '0.9093', 'grad_norm': '2.729', 'learning_rate': '3.696e-05', 'epoch': '0.07395', 'num_input_tokens_seen': 6012039, 'train_runtime': '3041', 'train_tokens_per_second': '1977'} +{'loss': '1.445', 'grad_norm': '2.662', 'learning_rate': '3.697e-05', 'epoch': '0.07397', 'num_input_tokens_seen': 6014086, 'train_runtime': '3042', 'train_tokens_per_second': '1977'} +{'loss': '0.998', 'grad_norm': '2.316', 'learning_rate': '3.698e-05', 'epoch': '0.074', 'num_input_tokens_seen': 6016133, 'train_runtime': '3043', 'train_tokens_per_second': '1977'} +{'loss': '0.6632', 'grad_norm': '2.223', 'learning_rate': '3.7e-05', 'epoch': '0.07402', 'num_input_tokens_seen': 6018180, 'train_runtime': '3044', 'train_tokens_per_second': '1977'} +{'loss': '0.5051', 'grad_norm': '1.77', 'learning_rate': '3.701e-05', 'epoch': '0.07405', 'num_input_tokens_seen': 6020227, 'train_runtime': '3045', 'train_tokens_per_second': '1977'} +{'loss': '0.451', 'grad_norm': '1.685', 'learning_rate': '3.702e-05', 'epoch': '0.07407', 'num_input_tokens_seen': 6022274, 'train_runtime': '3046', 'train_tokens_per_second': '1977'} +{'loss': '0.4708', 'grad_norm': '1.832', 'learning_rate': '3.703e-05', 'epoch': '0.0741', 'num_input_tokens_seen': 6024321, 'train_runtime': '3047', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '2.123', 'learning_rate': '3.705e-05', 'epoch': '0.07412', 'num_input_tokens_seen': 6026368, 'train_runtime': '3048', 'train_tokens_per_second': '1977'} +{'loss': '1.266', 'grad_norm': '2.599', 'learning_rate': '3.706e-05', 'epoch': '0.07415', 'num_input_tokens_seen': 6028415, 'train_runtime': '3049', 'train_tokens_per_second': '1977'} +{'loss': '0.3606', 'grad_norm': '1.72', 'learning_rate': '3.707e-05', 'epoch': '0.07417', 'num_input_tokens_seen': 6030462, 'train_runtime': '3050', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '2.825', 'learning_rate': '3.708e-05', 'epoch': '0.0742', 'num_input_tokens_seen': 6032509, 'train_runtime': '3051', 'train_tokens_per_second': '1977'} +{'loss': '0.8483', 'grad_norm': '2.53', 'learning_rate': '3.71e-05', 'epoch': '0.07423', 'num_input_tokens_seen': 6034556, 'train_runtime': '3052', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '2.665', 'learning_rate': '3.711e-05', 'epoch': '0.07425', 'num_input_tokens_seen': 6036603, 'train_runtime': '3053', 'train_tokens_per_second': '1977'} +{'loss': '0.3637', 'grad_norm': '1.57', 'learning_rate': '3.712e-05', 'epoch': '0.07428', 'num_input_tokens_seen': 6038650, 'train_runtime': '3054', 'train_tokens_per_second': '1977'} +{'loss': '0.8606', 'grad_norm': '2.67', 'learning_rate': '3.713e-05', 'epoch': '0.0743', 'num_input_tokens_seen': 6040697, 'train_runtime': '3055', 'train_tokens_per_second': '1977'} +{'loss': '0.4219', 'grad_norm': '1.461', 'learning_rate': '3.715e-05', 'epoch': '0.07433', 'num_input_tokens_seen': 6042744, 'train_runtime': '3056', 'train_tokens_per_second': '1977'} +{'loss': '0.5957', 'grad_norm': '2.092', 'learning_rate': '3.716e-05', 'epoch': '0.07435', 'num_input_tokens_seen': 6044791, 'train_runtime': '3057', 'train_tokens_per_second': '1977'} +{'loss': '0.6346', 'grad_norm': '2.134', 'learning_rate': '3.717e-05', 'epoch': '0.07438', 'num_input_tokens_seen': 6046838, 'train_runtime': '3058', 'train_tokens_per_second': '1977'} +{'loss': '0.939', 'grad_norm': '2.334', 'learning_rate': '3.719e-05', 'epoch': '0.0744', 'num_input_tokens_seen': 6048885, 'train_runtime': '3059', 'train_tokens_per_second': '1977'} +{'loss': '0.9992', 'grad_norm': '2.425', 'learning_rate': '3.72e-05', 'epoch': '0.07443', 'num_input_tokens_seen': 6050932, 'train_runtime': '3060', 'train_tokens_per_second': '1977'} +{'loss': '0.9775', 'grad_norm': '2.581', 'learning_rate': '3.721e-05', 'epoch': '0.07445', 'num_input_tokens_seen': 6052979, 'train_runtime': '3061', 'train_tokens_per_second': '1977'} +{'loss': '0.7141', 'grad_norm': '2.343', 'learning_rate': '3.722e-05', 'epoch': '0.07448', 'num_input_tokens_seen': 6055026, 'train_runtime': '3062', 'train_tokens_per_second': '1977'} +{'loss': '0.4575', 'grad_norm': '1.79', 'learning_rate': '3.724e-05', 'epoch': '0.0745', 'num_input_tokens_seen': 6057073, 'train_runtime': '3064', 'train_tokens_per_second': '1977'} +{'loss': '0.9783', 'grad_norm': '2.091', 'learning_rate': '3.725e-05', 'epoch': '0.07453', 'num_input_tokens_seen': 6059120, 'train_runtime': '3065', 'train_tokens_per_second': '1977'} +{'loss': '1.476', 'grad_norm': '2.733', 'learning_rate': '3.726e-05', 'epoch': '0.07455', 'num_input_tokens_seen': 6061167, 'train_runtime': '3066', 'train_tokens_per_second': '1977'} +{'loss': '0.6147', 'grad_norm': '1.983', 'learning_rate': '3.727e-05', 'epoch': '0.07458', 'num_input_tokens_seen': 6063214, 'train_runtime': '3067', 'train_tokens_per_second': '1977'} +{'loss': '0.4334', 'grad_norm': '1.75', 'learning_rate': '3.729e-05', 'epoch': '0.0746', 'num_input_tokens_seen': 6065261, 'train_runtime': '3068', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '2.804', 'learning_rate': '3.73e-05', 'epoch': '0.07463', 'num_input_tokens_seen': 6067308, 'train_runtime': '3069', 'train_tokens_per_second': '1977'} +{'loss': '1.864', 'grad_norm': '2.459', 'learning_rate': '3.731e-05', 'epoch': '0.07465', 'num_input_tokens_seen': 6069355, 'train_runtime': '3070', 'train_tokens_per_second': '1977'} +{'loss': '0.7611', 'grad_norm': '1.635', 'learning_rate': '3.732e-05', 'epoch': '0.07468', 'num_input_tokens_seen': 6071402, 'train_runtime': '3071', 'train_tokens_per_second': '1977'} +{'loss': '1.166', 'grad_norm': '2.334', 'learning_rate': '3.734e-05', 'epoch': '0.0747', 'num_input_tokens_seen': 6073449, 'train_runtime': '3072', 'train_tokens_per_second': '1977'} +{'loss': '0.3718', 'grad_norm': '1.583', 'learning_rate': '3.735e-05', 'epoch': '0.07473', 'num_input_tokens_seen': 6075496, 'train_runtime': '3073', 'train_tokens_per_second': '1977'} +{'loss': '0.4433', 'grad_norm': '1.647', 'learning_rate': '3.736e-05', 'epoch': '0.07475', 'num_input_tokens_seen': 6077543, 'train_runtime': '3074', 'train_tokens_per_second': '1977'} +{'loss': '0.841', 'grad_norm': '2.476', 'learning_rate': '3.737e-05', 'epoch': '0.07478', 'num_input_tokens_seen': 6079590, 'train_runtime': '3075', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '3.102', 'learning_rate': '3.739e-05', 'epoch': '0.0748', 'num_input_tokens_seen': 6081637, 'train_runtime': '3076', 'train_tokens_per_second': '1977'} +{'loss': '0.9489', 'grad_norm': '2.629', 'learning_rate': '3.74e-05', 'epoch': '0.07483', 'num_input_tokens_seen': 6083684, 'train_runtime': '3077', 'train_tokens_per_second': '1977'} +{'loss': '0.5266', 'grad_norm': '1.573', 'learning_rate': '3.741e-05', 'epoch': '0.07485', 'num_input_tokens_seen': 6085731, 'train_runtime': '3078', 'train_tokens_per_second': '1977'} +{'loss': '0.4795', 'grad_norm': '1.871', 'learning_rate': '3.742e-05', 'epoch': '0.07488', 'num_input_tokens_seen': 6087778, 'train_runtime': '3079', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '2.836', 'learning_rate': '3.744e-05', 'epoch': '0.0749', 'num_input_tokens_seen': 6089825, 'train_runtime': '3080', 'train_tokens_per_second': '1977'} +{'loss': '0.9583', 'grad_norm': '2.675', 'learning_rate': '3.745e-05', 'epoch': '0.07493', 'num_input_tokens_seen': 6091872, 'train_runtime': '3081', 'train_tokens_per_second': '1977'} +{'loss': '1.394', 'grad_norm': '2.697', 'learning_rate': '3.746e-05', 'epoch': '0.07496', 'num_input_tokens_seen': 6093919, 'train_runtime': '3082', 'train_tokens_per_second': '1977'} +{'loss': '0.679', 'grad_norm': '1.895', 'learning_rate': '3.747e-05', 'epoch': '0.07498', 'num_input_tokens_seen': 6095966, 'train_runtime': '3083', 'train_tokens_per_second': '1977'} +{'loss': '1.619', 'grad_norm': '2.145', 'learning_rate': '3.749e-05', 'epoch': '0.07501', 'num_input_tokens_seen': 6098013, 'train_runtime': '3084', 'train_tokens_per_second': '1977'} +{'loss': '0.554', 'grad_norm': '1.746', 'learning_rate': '3.75e-05', 'epoch': '0.07503', 'num_input_tokens_seen': 6100060, 'train_runtime': '3085', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '2.542', 'learning_rate': '3.751e-05', 'epoch': '0.07506', 'num_input_tokens_seen': 6102107, 'train_runtime': '3086', 'train_tokens_per_second': '1977'} +{'loss': '0.5115', 'grad_norm': '1.718', 'learning_rate': '3.753e-05', 'epoch': '0.07508', 'num_input_tokens_seen': 6104154, 'train_runtime': '3087', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '3.212', 'learning_rate': '3.754e-05', 'epoch': '0.07511', 'num_input_tokens_seen': 6106201, 'train_runtime': '3088', 'train_tokens_per_second': '1977'} +{'loss': '0.9212', 'grad_norm': '1.81', 'learning_rate': '3.755e-05', 'epoch': '0.07513', 'num_input_tokens_seen': 6108248, 'train_runtime': '3089', 'train_tokens_per_second': '1977'} +{'loss': '0.8066', 'grad_norm': '1.743', 'learning_rate': '3.756e-05', 'epoch': '0.07516', 'num_input_tokens_seen': 6110295, 'train_runtime': '3090', 'train_tokens_per_second': '1977'} +{'loss': '0.4096', 'grad_norm': '1.587', 'learning_rate': '3.758e-05', 'epoch': '0.07518', 'num_input_tokens_seen': 6112342, 'train_runtime': '3091', 'train_tokens_per_second': '1977'} +{'loss': '0.99', 'grad_norm': '2.761', 'learning_rate': '3.759e-05', 'epoch': '0.07521', 'num_input_tokens_seen': 6114389, 'train_runtime': '3092', 'train_tokens_per_second': '1977'} +{'loss': '0.8089', 'grad_norm': '2.126', 'learning_rate': '3.76e-05', 'epoch': '0.07523', 'num_input_tokens_seen': 6116436, 'train_runtime': '3094', 'train_tokens_per_second': '1977'} +{'loss': '0.4289', 'grad_norm': '1.785', 'learning_rate': '3.761e-05', 'epoch': '0.07526', 'num_input_tokens_seen': 6118483, 'train_runtime': '3095', 'train_tokens_per_second': '1977'} +{'loss': '1.018', 'grad_norm': '3.141', 'learning_rate': '3.763e-05', 'epoch': '0.07528', 'num_input_tokens_seen': 6120530, 'train_runtime': '3096', 'train_tokens_per_second': '1977'} +{'loss': '1.568', 'grad_norm': '3.143', 'learning_rate': '3.764e-05', 'epoch': '0.07531', 'num_input_tokens_seen': 6122577, 'train_runtime': '3097', 'train_tokens_per_second': '1977'} +{'loss': '0.7478', 'grad_norm': '2.303', 'learning_rate': '3.765e-05', 'epoch': '0.07533', 'num_input_tokens_seen': 6124624, 'train_runtime': '3098', 'train_tokens_per_second': '1977'} +{'loss': '0.4194', 'grad_norm': '2.135', 'learning_rate': '3.766e-05', 'epoch': '0.07536', 'num_input_tokens_seen': 6126671, 'train_runtime': '3099', 'train_tokens_per_second': '1977'} +{'loss': '0.6179', 'grad_norm': '1.965', 'learning_rate': '3.768e-05', 'epoch': '0.07538', 'num_input_tokens_seen': 6128718, 'train_runtime': '3100', 'train_tokens_per_second': '1977'} +{'loss': '1.27', 'grad_norm': '3.208', 'learning_rate': '3.769e-05', 'epoch': '0.07541', 'num_input_tokens_seen': 6130765, 'train_runtime': '3101', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '3.148', 'learning_rate': '3.77e-05', 'epoch': '0.07543', 'num_input_tokens_seen': 6132812, 'train_runtime': '3102', 'train_tokens_per_second': '1977'} +{'loss': '1.358', 'grad_norm': '3.167', 'learning_rate': '3.771e-05', 'epoch': '0.07546', 'num_input_tokens_seen': 6134859, 'train_runtime': '3103', 'train_tokens_per_second': '1977'} +{'loss': '2.013', 'grad_norm': '3.15', 'learning_rate': '3.773e-05', 'epoch': '0.07548', 'num_input_tokens_seen': 6136906, 'train_runtime': '3104', 'train_tokens_per_second': '1977'} +{'loss': '0.9313', 'grad_norm': '2.195', 'learning_rate': '3.774e-05', 'epoch': '0.07551', 'num_input_tokens_seen': 6138953, 'train_runtime': '3105', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '2.353', 'learning_rate': '3.775e-05', 'epoch': '0.07553', 'num_input_tokens_seen': 6141000, 'train_runtime': '3106', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 03:29:11,441 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 03:29:11,442 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 03:29:11,823 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-3000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 03:29:11,830 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-3000/tokenizer_config.json + +{'loss': '0.3924', 'grad_norm': '1.957', 'learning_rate': '3.776e-05', 'epoch': '0.07556', 'num_input_tokens_seen': 6143047, 'train_runtime': '3108', 'train_tokens_per_second': '1977'} +{'loss': '0.7368', 'grad_norm': '2.894', 'learning_rate': '3.778e-05', 'epoch': '0.07558', 'num_input_tokens_seen': 6145094, 'train_runtime': '3109', 'train_tokens_per_second': '1977'} +{'loss': '0.612', 'grad_norm': '2.056', 'learning_rate': '3.779e-05', 'epoch': '0.07561', 'num_input_tokens_seen': 6147141, 'train_runtime': '3110', 'train_tokens_per_second': '1977'} +{'loss': '0.9184', 'grad_norm': '2.986', 'learning_rate': '3.78e-05', 'epoch': '0.07564', 'num_input_tokens_seen': 6149188, 'train_runtime': '3111', 'train_tokens_per_second': '1977'} +{'loss': '1.118', 'grad_norm': '3.106', 'learning_rate': '3.781e-05', 'epoch': '0.07566', 'num_input_tokens_seen': 6151235, 'train_runtime': '3112', 'train_tokens_per_second': '1977'} +{'loss': '1.453', 'grad_norm': '2.888', 'learning_rate': '3.783e-05', 'epoch': '0.07569', 'num_input_tokens_seen': 6153282, 'train_runtime': '3113', 'train_tokens_per_second': '1977'} +{'loss': '0.4445', 'grad_norm': '1.62', 'learning_rate': '3.784e-05', 'epoch': '0.07571', 'num_input_tokens_seen': 6155329, 'train_runtime': '3114', 'train_tokens_per_second': '1977'} +{'loss': '0.9724', 'grad_norm': '2.498', 'learning_rate': '3.785e-05', 'epoch': '0.07574', 'num_input_tokens_seen': 6157376, 'train_runtime': '3115', 'train_tokens_per_second': '1977'} +{'loss': '0.3323', 'grad_norm': '1.521', 'learning_rate': '3.787e-05', 'epoch': '0.07576', 'num_input_tokens_seen': 6159423, 'train_runtime': '3116', 'train_tokens_per_second': '1977'} +{'loss': '0.9124', 'grad_norm': '2.29', 'learning_rate': '3.788e-05', 'epoch': '0.07579', 'num_input_tokens_seen': 6161470, 'train_runtime': '3117', 'train_tokens_per_second': '1977'} +{'loss': '0.8496', 'grad_norm': '2.272', 'learning_rate': '3.789e-05', 'epoch': '0.07581', 'num_input_tokens_seen': 6163517, 'train_runtime': '3118', 'train_tokens_per_second': '1977'} +{'loss': '0.841', 'grad_norm': '1.758', 'learning_rate': '3.79e-05', 'epoch': '0.07584', 'num_input_tokens_seen': 6165564, 'train_runtime': '3119', 'train_tokens_per_second': '1977'} +{'loss': '0.339', 'grad_norm': '1.987', 'learning_rate': '3.792e-05', 'epoch': '0.07586', 'num_input_tokens_seen': 6167611, 'train_runtime': '3120', 'train_tokens_per_second': '1977'} +{'loss': '0.6285', 'grad_norm': '1.982', 'learning_rate': '3.793e-05', 'epoch': '0.07589', 'num_input_tokens_seen': 6169658, 'train_runtime': '3121', 'train_tokens_per_second': '1977'} +{'loss': '0.3671', 'grad_norm': '1.414', 'learning_rate': '3.794e-05', 'epoch': '0.07591', 'num_input_tokens_seen': 6171705, 'train_runtime': '3122', 'train_tokens_per_second': '1977'} +{'loss': '1.136', 'grad_norm': '3.237', 'learning_rate': '3.795e-05', 'epoch': '0.07594', 'num_input_tokens_seen': 6173752, 'train_runtime': '3123', 'train_tokens_per_second': '1977'} +{'loss': '1.006', 'grad_norm': '2.3', 'learning_rate': '3.797e-05', 'epoch': '0.07596', 'num_input_tokens_seen': 6175799, 'train_runtime': '3124', 'train_tokens_per_second': '1977'} +{'loss': '0.8104', 'grad_norm': '2.234', 'learning_rate': '3.798e-05', 'epoch': '0.07599', 'num_input_tokens_seen': 6177846, 'train_runtime': '3125', 'train_tokens_per_second': '1977'} +{'loss': '1.205', 'grad_norm': '2.873', 'learning_rate': '3.799e-05', 'epoch': '0.07601', 'num_input_tokens_seen': 6179893, 'train_runtime': '3126', 'train_tokens_per_second': '1977'} +{'loss': '1.269', 'grad_norm': '2.742', 'learning_rate': '3.8e-05', 'epoch': '0.07604', 'num_input_tokens_seen': 6181940, 'train_runtime': '3127', 'train_tokens_per_second': '1977'} +{'loss': '1.369', 'grad_norm': '3.18', 'learning_rate': '3.802e-05', 'epoch': '0.07606', 'num_input_tokens_seen': 6183987, 'train_runtime': '3128', 'train_tokens_per_second': '1977'} +{'loss': '1.114', 'grad_norm': '2.932', 'learning_rate': '3.803e-05', 'epoch': '0.07609', 'num_input_tokens_seen': 6186034, 'train_runtime': '3129', 'train_tokens_per_second': '1977'} +{'loss': '1.452', 'grad_norm': '3.007', 'learning_rate': '3.804e-05', 'epoch': '0.07611', 'num_input_tokens_seen': 6188081, 'train_runtime': '3130', 'train_tokens_per_second': '1977'} +{'loss': '0.3662', 'grad_norm': '1.452', 'learning_rate': '3.805e-05', 'epoch': '0.07614', 'num_input_tokens_seen': 6190128, 'train_runtime': '3131', 'train_tokens_per_second': '1977'} +{'loss': '1.242', 'grad_norm': '3.064', 'learning_rate': '3.807e-05', 'epoch': '0.07616', 'num_input_tokens_seen': 6192175, 'train_runtime': '3132', 'train_tokens_per_second': '1977'} +{'loss': '0.4245', 'grad_norm': '1.609', 'learning_rate': '3.808e-05', 'epoch': '0.07619', 'num_input_tokens_seen': 6194222, 'train_runtime': '3133', 'train_tokens_per_second': '1977'} +{'loss': '1.183', 'grad_norm': '3.701', 'learning_rate': '3.809e-05', 'epoch': '0.07621', 'num_input_tokens_seen': 6196269, 'train_runtime': '3134', 'train_tokens_per_second': '1977'} +{'loss': '0.4347', 'grad_norm': '1.802', 'learning_rate': '3.81e-05', 'epoch': '0.07624', 'num_input_tokens_seen': 6198316, 'train_runtime': '3135', 'train_tokens_per_second': '1977'} +{'loss': '0.6695', 'grad_norm': '2.154', 'learning_rate': '3.812e-05', 'epoch': '0.07626', 'num_input_tokens_seen': 6200363, 'train_runtime': '3136', 'train_tokens_per_second': '1977'} +{'loss': '1.299', 'grad_norm': '2.05', 'learning_rate': '3.813e-05', 'epoch': '0.07629', 'num_input_tokens_seen': 6202410, 'train_runtime': '3137', 'train_tokens_per_second': '1977'} +{'loss': '0.8699', 'grad_norm': '3.342', 'learning_rate': '3.814e-05', 'epoch': '0.07631', 'num_input_tokens_seen': 6204457, 'train_runtime': '3139', 'train_tokens_per_second': '1977'} +{'loss': '1.294', 'grad_norm': '2.437', 'learning_rate': '3.815e-05', 'epoch': '0.07634', 'num_input_tokens_seen': 6206504, 'train_runtime': '3140', 'train_tokens_per_second': '1977'} +{'loss': '0.5999', 'grad_norm': '1.83', 'learning_rate': '3.817e-05', 'epoch': '0.07637', 'num_input_tokens_seen': 6208551, 'train_runtime': '3141', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '3.936', 'learning_rate': '3.818e-05', 'epoch': '0.07639', 'num_input_tokens_seen': 6210598, 'train_runtime': '3142', 'train_tokens_per_second': '1977'} +{'loss': '1.433', 'grad_norm': '3.249', 'learning_rate': '3.819e-05', 'epoch': '0.07642', 'num_input_tokens_seen': 6212645, 'train_runtime': '3143', 'train_tokens_per_second': '1977'} +{'loss': '0.4293', 'grad_norm': '1.435', 'learning_rate': '3.82e-05', 'epoch': '0.07644', 'num_input_tokens_seen': 6214692, 'train_runtime': '3144', 'train_tokens_per_second': '1977'} +{'loss': '0.4075', 'grad_norm': '1.618', 'learning_rate': '3.822e-05', 'epoch': '0.07647', 'num_input_tokens_seen': 6216739, 'train_runtime': '3145', 'train_tokens_per_second': '1977'} +{'loss': '0.4494', 'grad_norm': '2.06', 'learning_rate': '3.823e-05', 'epoch': '0.07649', 'num_input_tokens_seen': 6218786, 'train_runtime': '3146', 'train_tokens_per_second': '1977'} +{'loss': '0.7413', 'grad_norm': '2.404', 'learning_rate': '3.824e-05', 'epoch': '0.07652', 'num_input_tokens_seen': 6220833, 'train_runtime': '3147', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '2.761', 'learning_rate': '3.826e-05', 'epoch': '0.07654', 'num_input_tokens_seen': 6222880, 'train_runtime': '3148', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '2.37', 'learning_rate': '3.827e-05', 'epoch': '0.07657', 'num_input_tokens_seen': 6224927, 'train_runtime': '3149', 'train_tokens_per_second': '1977'} +{'loss': '3.354', 'grad_norm': '4.113', 'learning_rate': '3.828e-05', 'epoch': '0.07659', 'num_input_tokens_seen': 6226974, 'train_runtime': '3150', 'train_tokens_per_second': '1977'} +{'loss': '0.9071', 'grad_norm': '2.479', 'learning_rate': '3.829e-05', 'epoch': '0.07662', 'num_input_tokens_seen': 6229021, 'train_runtime': '3151', 'train_tokens_per_second': '1977'} +{'loss': '0.7217', 'grad_norm': '2.077', 'learning_rate': '3.831e-05', 'epoch': '0.07664', 'num_input_tokens_seen': 6231068, 'train_runtime': '3152', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '2.418', 'learning_rate': '3.832e-05', 'epoch': '0.07667', 'num_input_tokens_seen': 6233115, 'train_runtime': '3153', 'train_tokens_per_second': '1977'} +{'loss': '0.7643', 'grad_norm': '2.177', 'learning_rate': '3.833e-05', 'epoch': '0.07669', 'num_input_tokens_seen': 6235162, 'train_runtime': '3154', 'train_tokens_per_second': '1977'} +{'loss': '0.871', 'grad_norm': '2.53', 'learning_rate': '3.834e-05', 'epoch': '0.07672', 'num_input_tokens_seen': 6237209, 'train_runtime': '3155', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '2.55', 'learning_rate': '3.836e-05', 'epoch': '0.07674', 'num_input_tokens_seen': 6239256, 'train_runtime': '3156', 'train_tokens_per_second': '1977'} +{'loss': '1.367', 'grad_norm': '2.886', 'learning_rate': '3.837e-05', 'epoch': '0.07677', 'num_input_tokens_seen': 6241303, 'train_runtime': '3157', 'train_tokens_per_second': '1977'} +{'loss': '2.087', 'grad_norm': '2.59', 'learning_rate': '3.838e-05', 'epoch': '0.07679', 'num_input_tokens_seen': 6243350, 'train_runtime': '3158', 'train_tokens_per_second': '1977'} +{'loss': '0.9456', 'grad_norm': '2.36', 'learning_rate': '3.839e-05', 'epoch': '0.07682', 'num_input_tokens_seen': 6245397, 'train_runtime': '3159', 'train_tokens_per_second': '1977'} +{'loss': '0.4402', 'grad_norm': '1.739', 'learning_rate': '3.841e-05', 'epoch': '0.07684', 'num_input_tokens_seen': 6247444, 'train_runtime': '3160', 'train_tokens_per_second': '1977'} +{'loss': '2.565', 'grad_norm': '3.542', 'learning_rate': '3.842e-05', 'epoch': '0.07687', 'num_input_tokens_seen': 6249491, 'train_runtime': '3161', 'train_tokens_per_second': '1977'} +{'loss': '2.442', 'grad_norm': '3.553', 'learning_rate': '3.843e-05', 'epoch': '0.07689', 'num_input_tokens_seen': 6251538, 'train_runtime': '3162', 'train_tokens_per_second': '1977'} +{'loss': '0.4568', 'grad_norm': '1.552', 'learning_rate': '3.844e-05', 'epoch': '0.07692', 'num_input_tokens_seen': 6253585, 'train_runtime': '3163', 'train_tokens_per_second': '1977'} +{'loss': '0.5076', 'grad_norm': '1.614', 'learning_rate': '3.846e-05', 'epoch': '0.07694', 'num_input_tokens_seen': 6255632, 'train_runtime': '3164', 'train_tokens_per_second': '1977'} +{'loss': '0.8215', 'grad_norm': '2.204', 'learning_rate': '3.847e-05', 'epoch': '0.07697', 'num_input_tokens_seen': 6257679, 'train_runtime': '3165', 'train_tokens_per_second': '1977'} +{'loss': '1.22', 'grad_norm': '2.15', 'learning_rate': '3.848e-05', 'epoch': '0.07699', 'num_input_tokens_seen': 6259726, 'train_runtime': '3167', 'train_tokens_per_second': '1977'} +{'loss': '0.396', 'grad_norm': '1.786', 'learning_rate': '3.849e-05', 'epoch': '0.07702', 'num_input_tokens_seen': 6261773, 'train_runtime': '3168', 'train_tokens_per_second': '1977'} +{'loss': '0.4161', 'grad_norm': '1.959', 'learning_rate': '3.851e-05', 'epoch': '0.07705', 'num_input_tokens_seen': 6263820, 'train_runtime': '3169', 'train_tokens_per_second': '1977'} +{'loss': '0.3786', 'grad_norm': '2.087', 'learning_rate': '3.852e-05', 'epoch': '0.07707', 'num_input_tokens_seen': 6265867, 'train_runtime': '3170', 'train_tokens_per_second': '1977'} +{'loss': '1.82', 'grad_norm': '2.998', 'learning_rate': '3.853e-05', 'epoch': '0.0771', 'num_input_tokens_seen': 6267914, 'train_runtime': '3171', 'train_tokens_per_second': '1977'} +{'loss': '0.6227', 'grad_norm': '1.51', 'learning_rate': '3.854e-05', 'epoch': '0.07712', 'num_input_tokens_seen': 6269961, 'train_runtime': '3172', 'train_tokens_per_second': '1977'} +{'loss': '0.69', 'grad_norm': '1.997', 'learning_rate': '3.856e-05', 'epoch': '0.07715', 'num_input_tokens_seen': 6272008, 'train_runtime': '3173', 'train_tokens_per_second': '1977'} +{'loss': '1.863', 'grad_norm': '3.285', 'learning_rate': '3.857e-05', 'epoch': '0.07717', 'num_input_tokens_seen': 6274055, 'train_runtime': '3174', 'train_tokens_per_second': '1977'} +{'loss': '0.8215', 'grad_norm': '2.142', 'learning_rate': '3.858e-05', 'epoch': '0.0772', 'num_input_tokens_seen': 6276102, 'train_runtime': '3175', 'train_tokens_per_second': '1977'} +{'loss': '1.041', 'grad_norm': '2.683', 'learning_rate': '3.86e-05', 'epoch': '0.07722', 'num_input_tokens_seen': 6278149, 'train_runtime': '3176', 'train_tokens_per_second': '1977'} +{'loss': '0.9932', 'grad_norm': '2.526', 'learning_rate': '3.861e-05', 'epoch': '0.07725', 'num_input_tokens_seen': 6280196, 'train_runtime': '3177', 'train_tokens_per_second': '1977'} +{'loss': '0.6817', 'grad_norm': '1.959', 'learning_rate': '3.862e-05', 'epoch': '0.07727', 'num_input_tokens_seen': 6282243, 'train_runtime': '3178', 'train_tokens_per_second': '1977'} +{'loss': '1.887', 'grad_norm': '3.225', 'learning_rate': '3.863e-05', 'epoch': '0.0773', 'num_input_tokens_seen': 6284290, 'train_runtime': '3179', 'train_tokens_per_second': '1977'} +{'loss': '0.4265', 'grad_norm': '1.668', 'learning_rate': '3.865e-05', 'epoch': '0.07732', 'num_input_tokens_seen': 6286337, 'train_runtime': '3180', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '3.419', 'learning_rate': '3.866e-05', 'epoch': '0.07735', 'num_input_tokens_seen': 6288384, 'train_runtime': '3181', 'train_tokens_per_second': '1977'} +{'loss': '1.627', 'grad_norm': '3.367', 'learning_rate': '3.867e-05', 'epoch': '0.07737', 'num_input_tokens_seen': 6290431, 'train_runtime': '3182', 'train_tokens_per_second': '1977'} +{'loss': '0.5386', 'grad_norm': '1.738', 'learning_rate': '3.868e-05', 'epoch': '0.0774', 'num_input_tokens_seen': 6292478, 'train_runtime': '3183', 'train_tokens_per_second': '1977'} +{'loss': '1.189', 'grad_norm': '3.073', 'learning_rate': '3.87e-05', 'epoch': '0.07742', 'num_input_tokens_seen': 6294525, 'train_runtime': '3184', 'train_tokens_per_second': '1977'} +{'loss': '1.567', 'grad_norm': '3.261', 'learning_rate': '3.871e-05', 'epoch': '0.07745', 'num_input_tokens_seen': 6296572, 'train_runtime': '3185', 'train_tokens_per_second': '1977'} +{'loss': '0.7373', 'grad_norm': '1.917', 'learning_rate': '3.872e-05', 'epoch': '0.07747', 'num_input_tokens_seen': 6298619, 'train_runtime': '3186', 'train_tokens_per_second': '1977'} +{'loss': '1.549', 'grad_norm': '3.081', 'learning_rate': '3.873e-05', 'epoch': '0.0775', 'num_input_tokens_seen': 6300666, 'train_runtime': '3187', 'train_tokens_per_second': '1977'} +{'loss': '0.3529', 'grad_norm': '1.614', 'learning_rate': '3.875e-05', 'epoch': '0.07752', 'num_input_tokens_seen': 6302713, 'train_runtime': '3188', 'train_tokens_per_second': '1977'} +{'loss': '1.18', 'grad_norm': '2.945', 'learning_rate': '3.876e-05', 'epoch': '0.07755', 'num_input_tokens_seen': 6304760, 'train_runtime': '3189', 'train_tokens_per_second': '1977'} +{'loss': '1.358', 'grad_norm': '2.838', 'learning_rate': '3.877e-05', 'epoch': '0.07757', 'num_input_tokens_seen': 6306807, 'train_runtime': '3190', 'train_tokens_per_second': '1977'} +{'loss': '2.548', 'grad_norm': '3.532', 'learning_rate': '3.878e-05', 'epoch': '0.0776', 'num_input_tokens_seen': 6308854, 'train_runtime': '3191', 'train_tokens_per_second': '1977'} +{'loss': '0.7252', 'grad_norm': '2.203', 'learning_rate': '3.88e-05', 'epoch': '0.07762', 'num_input_tokens_seen': 6310901, 'train_runtime': '3192', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '2.185', 'learning_rate': '3.881e-05', 'epoch': '0.07765', 'num_input_tokens_seen': 6312948, 'train_runtime': '3193', 'train_tokens_per_second': '1977'} +{'loss': '0.9603', 'grad_norm': '1.958', 'learning_rate': '3.882e-05', 'epoch': '0.07767', 'num_input_tokens_seen': 6314995, 'train_runtime': '3194', 'train_tokens_per_second': '1977'} +{'loss': '0.4154', 'grad_norm': '1.384', 'learning_rate': '3.883e-05', 'epoch': '0.0777', 'num_input_tokens_seen': 6317042, 'train_runtime': '3195', 'train_tokens_per_second': '1977'} +{'loss': '1.104', 'grad_norm': '3.116', 'learning_rate': '3.885e-05', 'epoch': '0.07772', 'num_input_tokens_seen': 6319089, 'train_runtime': '3196', 'train_tokens_per_second': '1977'} +{'loss': '0.8383', 'grad_norm': '2.158', 'learning_rate': '3.886e-05', 'epoch': '0.07775', 'num_input_tokens_seen': 6321136, 'train_runtime': '3198', 'train_tokens_per_second': '1977'} +{'loss': '0.357', 'grad_norm': '1.253', 'learning_rate': '3.887e-05', 'epoch': '0.07778', 'num_input_tokens_seen': 6323183, 'train_runtime': '3199', 'train_tokens_per_second': '1977'} +{'loss': '0.381', 'grad_norm': '1.789', 'learning_rate': '3.888e-05', 'epoch': '0.0778', 'num_input_tokens_seen': 6325230, 'train_runtime': '3200', 'train_tokens_per_second': '1977'} +{'loss': '0.3236', 'grad_norm': '1.558', 'learning_rate': '3.89e-05', 'epoch': '0.07783', 'num_input_tokens_seen': 6327277, 'train_runtime': '3201', 'train_tokens_per_second': '1977'} +{'loss': '1.8', 'grad_norm': '2.867', 'learning_rate': '3.891e-05', 'epoch': '0.07785', 'num_input_tokens_seen': 6329324, 'train_runtime': '3202', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '2.425', 'learning_rate': '3.892e-05', 'epoch': '0.07788', 'num_input_tokens_seen': 6331371, 'train_runtime': '3203', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '2.909', 'learning_rate': '3.894e-05', 'epoch': '0.0779', 'num_input_tokens_seen': 6333418, 'train_runtime': '3204', 'train_tokens_per_second': '1977'} +{'loss': '1.372', 'grad_norm': '2.685', 'learning_rate': '3.895e-05', 'epoch': '0.07793', 'num_input_tokens_seen': 6335465, 'train_runtime': '3205', 'train_tokens_per_second': '1977'} +{'loss': '1.142', 'grad_norm': '2.72', 'learning_rate': '3.896e-05', 'epoch': '0.07795', 'num_input_tokens_seen': 6337512, 'train_runtime': '3206', 'train_tokens_per_second': '1977'} +{'loss': '1.865', 'grad_norm': '3.029', 'learning_rate': '3.897e-05', 'epoch': '0.07798', 'num_input_tokens_seen': 6339559, 'train_runtime': '3207', 'train_tokens_per_second': '1977'} +{'loss': '0.7864', 'grad_norm': '2.481', 'learning_rate': '3.899e-05', 'epoch': '0.078', 'num_input_tokens_seen': 6341606, 'train_runtime': '3208', 'train_tokens_per_second': '1977'} +{'loss': '0.9675', 'grad_norm': '2.33', 'learning_rate': '3.9e-05', 'epoch': '0.07803', 'num_input_tokens_seen': 6343653, 'train_runtime': '3209', 'train_tokens_per_second': '1977'} +{'loss': '0.9288', 'grad_norm': '2.201', 'learning_rate': '3.901e-05', 'epoch': '0.07805', 'num_input_tokens_seen': 6345700, 'train_runtime': '3210', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '2.21', 'learning_rate': '3.902e-05', 'epoch': '0.07808', 'num_input_tokens_seen': 6347747, 'train_runtime': '3211', 'train_tokens_per_second': '1977'} +{'loss': '0.502', 'grad_norm': '1.795', 'learning_rate': '3.904e-05', 'epoch': '0.0781', 'num_input_tokens_seen': 6349794, 'train_runtime': '3212', 'train_tokens_per_second': '1977'} +{'loss': '0.7941', 'grad_norm': '1.968', 'learning_rate': '3.905e-05', 'epoch': '0.07813', 'num_input_tokens_seen': 6351841, 'train_runtime': '3213', 'train_tokens_per_second': '1977'} +{'loss': '0.6544', 'grad_norm': '1.782', 'learning_rate': '3.906e-05', 'epoch': '0.07815', 'num_input_tokens_seen': 6353888, 'train_runtime': '3214', 'train_tokens_per_second': '1977'} +{'loss': '1.438', 'grad_norm': '3', 'learning_rate': '3.907e-05', 'epoch': '0.07818', 'num_input_tokens_seen': 6355935, 'train_runtime': '3215', 'train_tokens_per_second': '1977'} +{'loss': '1.696', 'grad_norm': '2.877', 'learning_rate': '3.909e-05', 'epoch': '0.0782', 'num_input_tokens_seen': 6357982, 'train_runtime': '3216', 'train_tokens_per_second': '1977'} +{'loss': '2.454', 'grad_norm': '2.83', 'learning_rate': '3.91e-05', 'epoch': '0.07823', 'num_input_tokens_seen': 6360029, 'train_runtime': '3217', 'train_tokens_per_second': '1977'} +{'loss': '0.8821', 'grad_norm': '2.439', 'learning_rate': '3.911e-05', 'epoch': '0.07825', 'num_input_tokens_seen': 6362076, 'train_runtime': '3218', 'train_tokens_per_second': '1977'} +{'loss': '0.4665', 'grad_norm': '1.891', 'learning_rate': '3.912e-05', 'epoch': '0.07828', 'num_input_tokens_seen': 6364123, 'train_runtime': '3219', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '2.444', 'learning_rate': '3.914e-05', 'epoch': '0.0783', 'num_input_tokens_seen': 6366170, 'train_runtime': '3220', 'train_tokens_per_second': '1977'} +{'loss': '0.5167', 'grad_norm': '1.841', 'learning_rate': '3.915e-05', 'epoch': '0.07833', 'num_input_tokens_seen': 6368217, 'train_runtime': '3221', 'train_tokens_per_second': '1977'} +{'loss': '0.4842', 'grad_norm': '2.026', 'learning_rate': '3.916e-05', 'epoch': '0.07835', 'num_input_tokens_seen': 6370264, 'train_runtime': '3222', 'train_tokens_per_second': '1977'} +{'loss': '0.6082', 'grad_norm': '2.005', 'learning_rate': '3.917e-05', 'epoch': '0.07838', 'num_input_tokens_seen': 6372311, 'train_runtime': '3223', 'train_tokens_per_second': '1977'} +{'loss': '0.8149', 'grad_norm': '2.153', 'learning_rate': '3.919e-05', 'epoch': '0.0784', 'num_input_tokens_seen': 6374358, 'train_runtime': '3224', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '2.686', 'learning_rate': '3.92e-05', 'epoch': '0.07843', 'num_input_tokens_seen': 6376405, 'train_runtime': '3225', 'train_tokens_per_second': '1977'} +{'loss': '0.761', 'grad_norm': '1.749', 'learning_rate': '3.921e-05', 'epoch': '0.07846', 'num_input_tokens_seen': 6378452, 'train_runtime': '3226', 'train_tokens_per_second': '1977'} +{'loss': '0.3833', 'grad_norm': '1.53', 'learning_rate': '3.922e-05', 'epoch': '0.07848', 'num_input_tokens_seen': 6380499, 'train_runtime': '3228', 'train_tokens_per_second': '1977'} +{'loss': '0.6351', 'grad_norm': '1.908', 'learning_rate': '3.924e-05', 'epoch': '0.07851', 'num_input_tokens_seen': 6382546, 'train_runtime': '3229', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '2.385', 'learning_rate': '3.925e-05', 'epoch': '0.07853', 'num_input_tokens_seen': 6384593, 'train_runtime': '3230', 'train_tokens_per_second': '1977'} +{'loss': '1.394', 'grad_norm': '3.534', 'learning_rate': '3.926e-05', 'epoch': '0.07856', 'num_input_tokens_seen': 6386640, 'train_runtime': '3231', 'train_tokens_per_second': '1977'} +{'loss': '0.3708', 'grad_norm': '1.578', 'learning_rate': '3.927e-05', 'epoch': '0.07858', 'num_input_tokens_seen': 6388687, 'train_runtime': '3232', 'train_tokens_per_second': '1977'} +{'loss': '0.6187', 'grad_norm': '2.048', 'learning_rate': '3.929e-05', 'epoch': '0.07861', 'num_input_tokens_seen': 6390734, 'train_runtime': '3233', 'train_tokens_per_second': '1977'} +{'loss': '0.3825', 'grad_norm': '1.688', 'learning_rate': '3.93e-05', 'epoch': '0.07863', 'num_input_tokens_seen': 6392781, 'train_runtime': '3234', 'train_tokens_per_second': '1977'} +{'loss': '0.4499', 'grad_norm': '1.976', 'learning_rate': '3.931e-05', 'epoch': '0.07866', 'num_input_tokens_seen': 6394828, 'train_runtime': '3235', 'train_tokens_per_second': '1977'} +{'loss': '1.825', 'grad_norm': '3.686', 'learning_rate': '3.933e-05', 'epoch': '0.07868', 'num_input_tokens_seen': 6396875, 'train_runtime': '3236', 'train_tokens_per_second': '1977'} +{'loss': '0.7419', 'grad_norm': '2.23', 'learning_rate': '3.934e-05', 'epoch': '0.07871', 'num_input_tokens_seen': 6398922, 'train_runtime': '3237', 'train_tokens_per_second': '1977'} +{'loss': '0.535', 'grad_norm': '1.585', 'learning_rate': '3.935e-05', 'epoch': '0.07873', 'num_input_tokens_seen': 6400969, 'train_runtime': '3238', 'train_tokens_per_second': '1977'} +{'loss': '0.4506', 'grad_norm': '1.766', 'learning_rate': '3.936e-05', 'epoch': '0.07876', 'num_input_tokens_seen': 6403016, 'train_runtime': '3239', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.961', 'learning_rate': '3.938e-05', 'epoch': '0.07878', 'num_input_tokens_seen': 6405063, 'train_runtime': '3240', 'train_tokens_per_second': '1977'} +{'loss': '0.8761', 'grad_norm': '2.136', 'learning_rate': '3.939e-05', 'epoch': '0.07881', 'num_input_tokens_seen': 6407110, 'train_runtime': '3241', 'train_tokens_per_second': '1977'} +{'loss': '0.3906', 'grad_norm': '1.859', 'learning_rate': '3.94e-05', 'epoch': '0.07883', 'num_input_tokens_seen': 6409157, 'train_runtime': '3242', 'train_tokens_per_second': '1977'} +{'loss': '0.6895', 'grad_norm': '1.885', 'learning_rate': '3.941e-05', 'epoch': '0.07886', 'num_input_tokens_seen': 6411204, 'train_runtime': '3243', 'train_tokens_per_second': '1977'} +{'loss': '0.8057', 'grad_norm': '2.538', 'learning_rate': '3.943e-05', 'epoch': '0.07888', 'num_input_tokens_seen': 6413251, 'train_runtime': '3244', 'train_tokens_per_second': '1977'} +{'loss': '0.298', 'grad_norm': '1.439', 'learning_rate': '3.944e-05', 'epoch': '0.07891', 'num_input_tokens_seen': 6415298, 'train_runtime': '3245', 'train_tokens_per_second': '1977'} +{'loss': '0.9241', 'grad_norm': '2.338', 'learning_rate': '3.945e-05', 'epoch': '0.07893', 'num_input_tokens_seen': 6417345, 'train_runtime': '3246', 'train_tokens_per_second': '1977'} +{'loss': '0.8326', 'grad_norm': '1.841', 'learning_rate': '3.946e-05', 'epoch': '0.07896', 'num_input_tokens_seen': 6419392, 'train_runtime': '3247', 'train_tokens_per_second': '1977'} +{'loss': '0.4173', 'grad_norm': '1.696', 'learning_rate': '3.948e-05', 'epoch': '0.07898', 'num_input_tokens_seen': 6421439, 'train_runtime': '3248', 'train_tokens_per_second': '1977'} +{'loss': '0.3869', 'grad_norm': '1.624', 'learning_rate': '3.949e-05', 'epoch': '0.07901', 'num_input_tokens_seen': 6423486, 'train_runtime': '3249', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '2.536', 'learning_rate': '3.95e-05', 'epoch': '0.07903', 'num_input_tokens_seen': 6425533, 'train_runtime': '3250', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '3.54', 'learning_rate': '3.951e-05', 'epoch': '0.07906', 'num_input_tokens_seen': 6427580, 'train_runtime': '3251', 'train_tokens_per_second': '1977'} +{'loss': '0.7736', 'grad_norm': '2.177', 'learning_rate': '3.953e-05', 'epoch': '0.07908', 'num_input_tokens_seen': 6429627, 'train_runtime': '3252', 'train_tokens_per_second': '1977'} +{'loss': '0.9586', 'grad_norm': '2.765', 'learning_rate': '3.954e-05', 'epoch': '0.07911', 'num_input_tokens_seen': 6431674, 'train_runtime': '3253', 'train_tokens_per_second': '1977'} +{'loss': '1.386', 'grad_norm': '2.499', 'learning_rate': '3.955e-05', 'epoch': '0.07913', 'num_input_tokens_seen': 6433721, 'train_runtime': '3254', 'train_tokens_per_second': '1977'} +{'loss': '1.75', 'grad_norm': '3.259', 'learning_rate': '3.956e-05', 'epoch': '0.07916', 'num_input_tokens_seen': 6435768, 'train_runtime': '3255', 'train_tokens_per_second': '1977'} +{'loss': '0.7287', 'grad_norm': '2.662', 'learning_rate': '3.958e-05', 'epoch': '0.07919', 'num_input_tokens_seen': 6437815, 'train_runtime': '3256', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.872', 'learning_rate': '3.959e-05', 'epoch': '0.07921', 'num_input_tokens_seen': 6439862, 'train_runtime': '3258', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '2.156', 'learning_rate': '3.96e-05', 'epoch': '0.07924', 'num_input_tokens_seen': 6441909, 'train_runtime': '3259', 'train_tokens_per_second': '1977'} +{'loss': '0.4736', 'grad_norm': '1.748', 'learning_rate': '3.961e-05', 'epoch': '0.07926', 'num_input_tokens_seen': 6443956, 'train_runtime': '3260', 'train_tokens_per_second': '1977'} +{'loss': '0.6973', 'grad_norm': '2.105', 'learning_rate': '3.963e-05', 'epoch': '0.07929', 'num_input_tokens_seen': 6446003, 'train_runtime': '3261', 'train_tokens_per_second': '1977'} +{'loss': '1.67', 'grad_norm': '3.641', 'learning_rate': '3.964e-05', 'epoch': '0.07931', 'num_input_tokens_seen': 6448050, 'train_runtime': '3262', 'train_tokens_per_second': '1977'} +{'loss': '0.8628', 'grad_norm': '2.386', 'learning_rate': '3.965e-05', 'epoch': '0.07934', 'num_input_tokens_seen': 6450097, 'train_runtime': '3263', 'train_tokens_per_second': '1977'} +{'loss': '0.6454', 'grad_norm': '2.186', 'learning_rate': '3.967e-05', 'epoch': '0.07936', 'num_input_tokens_seen': 6452144, 'train_runtime': '3264', 'train_tokens_per_second': '1977'} +{'loss': '0.6156', 'grad_norm': '1.782', 'learning_rate': '3.968e-05', 'epoch': '0.07939', 'num_input_tokens_seen': 6454191, 'train_runtime': '3265', 'train_tokens_per_second': '1977'} +{'loss': '1.357', 'grad_norm': '2.661', 'learning_rate': '3.969e-05', 'epoch': '0.07941', 'num_input_tokens_seen': 6456238, 'train_runtime': '3266', 'train_tokens_per_second': '1977'} +{'loss': '1.495', 'grad_norm': '2.915', 'learning_rate': '3.97e-05', 'epoch': '0.07944', 'num_input_tokens_seen': 6458285, 'train_runtime': '3267', 'train_tokens_per_second': '1977'} +{'loss': '0.6367', 'grad_norm': '1.821', 'learning_rate': '3.972e-05', 'epoch': '0.07946', 'num_input_tokens_seen': 6460332, 'train_runtime': '3268', 'train_tokens_per_second': '1977'} +{'loss': '0.4549', 'grad_norm': '1.782', 'learning_rate': '3.973e-05', 'epoch': '0.07949', 'num_input_tokens_seen': 6462379, 'train_runtime': '3269', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '3.425', 'learning_rate': '3.974e-05', 'epoch': '0.07951', 'num_input_tokens_seen': 6464426, 'train_runtime': '3270', 'train_tokens_per_second': '1977'} +{'loss': '0.5582', 'grad_norm': '1.72', 'learning_rate': '3.975e-05', 'epoch': '0.07954', 'num_input_tokens_seen': 6466473, 'train_runtime': '3271', 'train_tokens_per_second': '1977'} +{'loss': '0.4332', 'grad_norm': '2.162', 'learning_rate': '3.977e-05', 'epoch': '0.07956', 'num_input_tokens_seen': 6468520, 'train_runtime': '3272', 'train_tokens_per_second': '1977'} +{'loss': '1.265', 'grad_norm': '2.741', 'learning_rate': '3.978e-05', 'epoch': '0.07959', 'num_input_tokens_seen': 6470567, 'train_runtime': '3273', 'train_tokens_per_second': '1977'} +{'loss': '0.9123', 'grad_norm': '1.719', 'learning_rate': '3.979e-05', 'epoch': '0.07961', 'num_input_tokens_seen': 6472614, 'train_runtime': '3274', 'train_tokens_per_second': '1977'} +{'loss': '0.4365', 'grad_norm': '1.997', 'learning_rate': '3.98e-05', 'epoch': '0.07964', 'num_input_tokens_seen': 6474661, 'train_runtime': '3275', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '2.684', 'learning_rate': '3.982e-05', 'epoch': '0.07966', 'num_input_tokens_seen': 6476708, 'train_runtime': '3276', 'train_tokens_per_second': '1977'} +{'loss': '1.364', 'grad_norm': '3.236', 'learning_rate': '3.983e-05', 'epoch': '0.07969', 'num_input_tokens_seen': 6478755, 'train_runtime': '3277', 'train_tokens_per_second': '1977'} +{'loss': '0.4816', 'grad_norm': '1.657', 'learning_rate': '3.984e-05', 'epoch': '0.07971', 'num_input_tokens_seen': 6480802, 'train_runtime': '3278', 'train_tokens_per_second': '1977'} +{'loss': '0.6821', 'grad_norm': '1.978', 'learning_rate': '3.985e-05', 'epoch': '0.07974', 'num_input_tokens_seen': 6482849, 'train_runtime': '3279', 'train_tokens_per_second': '1977'} +{'loss': '0.6073', 'grad_norm': '1.756', 'learning_rate': '3.987e-05', 'epoch': '0.07976', 'num_input_tokens_seen': 6484896, 'train_runtime': '3280', 'train_tokens_per_second': '1977'} +{'loss': '0.778', 'grad_norm': '2.461', 'learning_rate': '3.988e-05', 'epoch': '0.07979', 'num_input_tokens_seen': 6486943, 'train_runtime': '3281', 'train_tokens_per_second': '1977'} +{'loss': '0.7442', 'grad_norm': '1.902', 'learning_rate': '3.989e-05', 'epoch': '0.07981', 'num_input_tokens_seen': 6488990, 'train_runtime': '3282', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '2.541', 'learning_rate': '3.99e-05', 'epoch': '0.07984', 'num_input_tokens_seen': 6491037, 'train_runtime': '3283', 'train_tokens_per_second': '1977'} +{'loss': '0.9017', 'grad_norm': '2.264', 'learning_rate': '3.992e-05', 'epoch': '0.07987', 'num_input_tokens_seen': 6493084, 'train_runtime': '3284', 'train_tokens_per_second': '1977'} +{'loss': '0.4015', 'grad_norm': '1.599', 'learning_rate': '3.993e-05', 'epoch': '0.07989', 'num_input_tokens_seen': 6495131, 'train_runtime': '3285', 'train_tokens_per_second': '1977'} +{'loss': '0.9169', 'grad_norm': '2.26', 'learning_rate': '3.994e-05', 'epoch': '0.07992', 'num_input_tokens_seen': 6497178, 'train_runtime': '3287', 'train_tokens_per_second': '1977'} +{'loss': '1.696', 'grad_norm': '3.259', 'learning_rate': '3.995e-05', 'epoch': '0.07994', 'num_input_tokens_seen': 6499225, 'train_runtime': '3288', 'train_tokens_per_second': '1977'} +{'loss': '0.3825', 'grad_norm': '1.481', 'learning_rate': '3.997e-05', 'epoch': '0.07997', 'num_input_tokens_seen': 6501272, 'train_runtime': '3289', 'train_tokens_per_second': '1977'} +{'loss': '1.208', 'grad_norm': '2.287', 'learning_rate': '3.998e-05', 'epoch': '0.07999', 'num_input_tokens_seen': 6503319, 'train_runtime': '3290', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '2.639', 'learning_rate': '3.999e-05', 'epoch': '0.08002', 'num_input_tokens_seen': 6505366, 'train_runtime': '3291', 'train_tokens_per_second': '1977'} +{'loss': '0.9549', 'grad_norm': '2.072', 'learning_rate': '4.001e-05', 'epoch': '0.08004', 'num_input_tokens_seen': 6507413, 'train_runtime': '3292', 'train_tokens_per_second': '1977'} +{'loss': '2.509', 'grad_norm': '3.363', 'learning_rate': '4.002e-05', 'epoch': '0.08007', 'num_input_tokens_seen': 6509460, 'train_runtime': '3293', 'train_tokens_per_second': '1977'} +{'loss': '0.3638', 'grad_norm': '1.807', 'learning_rate': '4.003e-05', 'epoch': '0.08009', 'num_input_tokens_seen': 6511507, 'train_runtime': '3294', 'train_tokens_per_second': '1977'} +{'loss': '0.9542', 'grad_norm': '2.386', 'learning_rate': '4.004e-05', 'epoch': '0.08012', 'num_input_tokens_seen': 6513554, 'train_runtime': '3295', 'train_tokens_per_second': '1977'} +{'loss': '0.3999', 'grad_norm': '1.356', 'learning_rate': '4.006e-05', 'epoch': '0.08014', 'num_input_tokens_seen': 6515601, 'train_runtime': '3296', 'train_tokens_per_second': '1977'} +{'loss': '1.702', 'grad_norm': '2.922', 'learning_rate': '4.007e-05', 'epoch': '0.08017', 'num_input_tokens_seen': 6517648, 'train_runtime': '3297', 'train_tokens_per_second': '1977'} +{'loss': '0.9272', 'grad_norm': '2.296', 'learning_rate': '4.008e-05', 'epoch': '0.08019', 'num_input_tokens_seen': 6519695, 'train_runtime': '3298', 'train_tokens_per_second': '1977'} +{'loss': '0.773', 'grad_norm': '2.432', 'learning_rate': '4.009e-05', 'epoch': '0.08022', 'num_input_tokens_seen': 6521742, 'train_runtime': '3299', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '2.555', 'learning_rate': '4.011e-05', 'epoch': '0.08024', 'num_input_tokens_seen': 6523789, 'train_runtime': '3300', 'train_tokens_per_second': '1977'} +{'loss': '1.322', 'grad_norm': '3.12', 'learning_rate': '4.012e-05', 'epoch': '0.08027', 'num_input_tokens_seen': 6525836, 'train_runtime': '3301', 'train_tokens_per_second': '1977'} +{'loss': '2.17', 'grad_norm': '3.658', 'learning_rate': '4.013e-05', 'epoch': '0.08029', 'num_input_tokens_seen': 6527883, 'train_runtime': '3302', 'train_tokens_per_second': '1977'} +{'loss': '1.789', 'grad_norm': '3.892', 'learning_rate': '4.014e-05', 'epoch': '0.08032', 'num_input_tokens_seen': 6529930, 'train_runtime': '3303', 'train_tokens_per_second': '1977'} +{'loss': '0.7778', 'grad_norm': '1.974', 'learning_rate': '4.016e-05', 'epoch': '0.08034', 'num_input_tokens_seen': 6531977, 'train_runtime': '3304', 'train_tokens_per_second': '1977'} +{'loss': '0.4386', 'grad_norm': '1.965', 'learning_rate': '4.017e-05', 'epoch': '0.08037', 'num_input_tokens_seen': 6534024, 'train_runtime': '3305', 'train_tokens_per_second': '1977'} +{'loss': '0.8838', 'grad_norm': '2.381', 'learning_rate': '4.018e-05', 'epoch': '0.08039', 'num_input_tokens_seen': 6536071, 'train_runtime': '3306', 'train_tokens_per_second': '1977'} +{'loss': '0.388', 'grad_norm': '1.311', 'learning_rate': '4.019e-05', 'epoch': '0.08042', 'num_input_tokens_seen': 6538118, 'train_runtime': '3307', 'train_tokens_per_second': '1977'} +{'loss': '0.4314', 'grad_norm': '1.464', 'learning_rate': '4.021e-05', 'epoch': '0.08044', 'num_input_tokens_seen': 6540165, 'train_runtime': '3308', 'train_tokens_per_second': '1977'} +{'loss': '0.8086', 'grad_norm': '2.11', 'learning_rate': '4.022e-05', 'epoch': '0.08047', 'num_input_tokens_seen': 6542212, 'train_runtime': '3309', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '2.148', 'learning_rate': '4.023e-05', 'epoch': '0.08049', 'num_input_tokens_seen': 6544259, 'train_runtime': '3310', 'train_tokens_per_second': '1977'} +{'loss': '0.4367', 'grad_norm': '1.278', 'learning_rate': '4.024e-05', 'epoch': '0.08052', 'num_input_tokens_seen': 6546306, 'train_runtime': '3311', 'train_tokens_per_second': '1977'} +{'loss': '2.075', 'grad_norm': '2.991', 'learning_rate': '4.026e-05', 'epoch': '0.08054', 'num_input_tokens_seen': 6548353, 'train_runtime': '3312', 'train_tokens_per_second': '1977'} +{'loss': '0.7406', 'grad_norm': '1.996', 'learning_rate': '4.027e-05', 'epoch': '0.08057', 'num_input_tokens_seen': 6550400, 'train_runtime': '3313', 'train_tokens_per_second': '1977'} +{'loss': '0.5898', 'grad_norm': '1.602', 'learning_rate': '4.028e-05', 'epoch': '0.0806', 'num_input_tokens_seen': 6552447, 'train_runtime': '3314', 'train_tokens_per_second': '1977'} +{'loss': '1.559', 'grad_norm': '3.233', 'learning_rate': '4.029e-05', 'epoch': '0.08062', 'num_input_tokens_seen': 6554494, 'train_runtime': '3315', 'train_tokens_per_second': '1977'} +{'loss': '0.3825', 'grad_norm': '1.608', 'learning_rate': '4.031e-05', 'epoch': '0.08065', 'num_input_tokens_seen': 6556541, 'train_runtime': '3317', 'train_tokens_per_second': '1977'} +{'loss': '0.8623', 'grad_norm': '2.781', 'learning_rate': '4.032e-05', 'epoch': '0.08067', 'num_input_tokens_seen': 6558588, 'train_runtime': '3318', 'train_tokens_per_second': '1977'} +{'loss': '0.5113', 'grad_norm': '1.956', 'learning_rate': '4.033e-05', 'epoch': '0.0807', 'num_input_tokens_seen': 6560635, 'train_runtime': '3319', 'train_tokens_per_second': '1977'} +{'loss': '0.4581', 'grad_norm': '1.82', 'learning_rate': '4.034e-05', 'epoch': '0.08072', 'num_input_tokens_seen': 6562682, 'train_runtime': '3320', 'train_tokens_per_second': '1977'} +{'loss': '0.3476', 'grad_norm': '1.476', 'learning_rate': '4.036e-05', 'epoch': '0.08075', 'num_input_tokens_seen': 6564729, 'train_runtime': '3321', 'train_tokens_per_second': '1977'} +{'loss': '1.295', 'grad_norm': '3.009', 'learning_rate': '4.037e-05', 'epoch': '0.08077', 'num_input_tokens_seen': 6566776, 'train_runtime': '3322', 'train_tokens_per_second': '1977'} +{'loss': '0.3605', 'grad_norm': '2.13', 'learning_rate': '4.038e-05', 'epoch': '0.0808', 'num_input_tokens_seen': 6568823, 'train_runtime': '3323', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '3.324', 'learning_rate': '4.04e-05', 'epoch': '0.08082', 'num_input_tokens_seen': 6570870, 'train_runtime': '3324', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '2.61', 'learning_rate': '4.041e-05', 'epoch': '0.08085', 'num_input_tokens_seen': 6572917, 'train_runtime': '3325', 'train_tokens_per_second': '1977'} +{'loss': '0.5288', 'grad_norm': '2.116', 'learning_rate': '4.042e-05', 'epoch': '0.08087', 'num_input_tokens_seen': 6574964, 'train_runtime': '3326', 'train_tokens_per_second': '1977'} +{'loss': '1.216', 'grad_norm': '2.884', 'learning_rate': '4.043e-05', 'epoch': '0.0809', 'num_input_tokens_seen': 6577011, 'train_runtime': '3327', 'train_tokens_per_second': '1977'} +{'loss': '0.9955', 'grad_norm': '2.312', 'learning_rate': '4.045e-05', 'epoch': '0.08092', 'num_input_tokens_seen': 6579058, 'train_runtime': '3328', 'train_tokens_per_second': '1977'} +{'loss': '1.275', 'grad_norm': '2.918', 'learning_rate': '4.046e-05', 'epoch': '0.08095', 'num_input_tokens_seen': 6581105, 'train_runtime': '3329', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '2.923', 'learning_rate': '4.047e-05', 'epoch': '0.08097', 'num_input_tokens_seen': 6583152, 'train_runtime': '3330', 'train_tokens_per_second': '1977'} +{'loss': '0.7707', 'grad_norm': '2.543', 'learning_rate': '4.048e-05', 'epoch': '0.081', 'num_input_tokens_seen': 6585199, 'train_runtime': '3331', 'train_tokens_per_second': '1977'} +{'loss': '0.5114', 'grad_norm': '2.029', 'learning_rate': '4.05e-05', 'epoch': '0.08102', 'num_input_tokens_seen': 6587246, 'train_runtime': '3332', 'train_tokens_per_second': '1977'} +{'loss': '1.16', 'grad_norm': '2.662', 'learning_rate': '4.051e-05', 'epoch': '0.08105', 'num_input_tokens_seen': 6589293, 'train_runtime': '3333', 'train_tokens_per_second': '1977'} +{'loss': '2.354', 'grad_norm': '2.686', 'learning_rate': '4.052e-05', 'epoch': '0.08107', 'num_input_tokens_seen': 6591340, 'train_runtime': '3334', 'train_tokens_per_second': '1977'} +{'loss': '1.458', 'grad_norm': '2.806', 'learning_rate': '4.053e-05', 'epoch': '0.0811', 'num_input_tokens_seen': 6593387, 'train_runtime': '3335', 'train_tokens_per_second': '1977'} +{'loss': '0.893', 'grad_norm': '2.103', 'learning_rate': '4.055e-05', 'epoch': '0.08112', 'num_input_tokens_seen': 6595434, 'train_runtime': '3336', 'train_tokens_per_second': '1977'} +{'loss': '0.7207', 'grad_norm': '1.997', 'learning_rate': '4.056e-05', 'epoch': '0.08115', 'num_input_tokens_seen': 6597481, 'train_runtime': '3337', 'train_tokens_per_second': '1977'} +{'loss': '2.309', 'grad_norm': '3.192', 'learning_rate': '4.057e-05', 'epoch': '0.08117', 'num_input_tokens_seen': 6599528, 'train_runtime': '3338', 'train_tokens_per_second': '1977'} +{'loss': '0.8994', 'grad_norm': '1.872', 'learning_rate': '4.058e-05', 'epoch': '0.0812', 'num_input_tokens_seen': 6601575, 'train_runtime': '3339', 'train_tokens_per_second': '1977'} +{'loss': '0.456', 'grad_norm': '1.705', 'learning_rate': '4.06e-05', 'epoch': '0.08122', 'num_input_tokens_seen': 6603622, 'train_runtime': '3340', 'train_tokens_per_second': '1977'} +{'loss': '1.278', 'grad_norm': '2.263', 'learning_rate': '4.061e-05', 'epoch': '0.08125', 'num_input_tokens_seen': 6605669, 'train_runtime': '3341', 'train_tokens_per_second': '1977'} +{'loss': '0.407', 'grad_norm': '1.495', 'learning_rate': '4.062e-05', 'epoch': '0.08128', 'num_input_tokens_seen': 6607716, 'train_runtime': '3342', 'train_tokens_per_second': '1977'} +{'loss': '0.3273', 'grad_norm': '1.379', 'learning_rate': '4.063e-05', 'epoch': '0.0813', 'num_input_tokens_seen': 6609763, 'train_runtime': '3343', 'train_tokens_per_second': '1977'} +{'loss': '0.4148', 'grad_norm': '1.553', 'learning_rate': '4.065e-05', 'epoch': '0.08133', 'num_input_tokens_seen': 6611810, 'train_runtime': '3344', 'train_tokens_per_second': '1977'} +{'loss': '3.395', 'grad_norm': '3.079', 'learning_rate': '4.066e-05', 'epoch': '0.08135', 'num_input_tokens_seen': 6613857, 'train_runtime': '3345', 'train_tokens_per_second': '1977'} +{'loss': '0.6573', 'grad_norm': '1.754', 'learning_rate': '4.067e-05', 'epoch': '0.08138', 'num_input_tokens_seen': 6615904, 'train_runtime': '3347', 'train_tokens_per_second': '1977'} +{'loss': '0.4082', 'grad_norm': '1.546', 'learning_rate': '4.068e-05', 'epoch': '0.0814', 'num_input_tokens_seen': 6617951, 'train_runtime': '3348', 'train_tokens_per_second': '1977'} +{'loss': '0.529', 'grad_norm': '1.666', 'learning_rate': '4.07e-05', 'epoch': '0.08143', 'num_input_tokens_seen': 6619998, 'train_runtime': '3349', 'train_tokens_per_second': '1977'} +{'loss': '0.7862', 'grad_norm': '2.118', 'learning_rate': '4.071e-05', 'epoch': '0.08145', 'num_input_tokens_seen': 6622045, 'train_runtime': '3350', 'train_tokens_per_second': '1977'} +{'loss': '2.735', 'grad_norm': '3.27', 'learning_rate': '4.072e-05', 'epoch': '0.08148', 'num_input_tokens_seen': 6624092, 'train_runtime': '3351', 'train_tokens_per_second': '1977'} +{'loss': '0.6207', 'grad_norm': '2.751', 'learning_rate': '4.074e-05', 'epoch': '0.0815', 'num_input_tokens_seen': 6626139, 'train_runtime': '3352', 'train_tokens_per_second': '1977'} +{'loss': '0.3671', 'grad_norm': '1.756', 'learning_rate': '4.075e-05', 'epoch': '0.08153', 'num_input_tokens_seen': 6628186, 'train_runtime': '3353', 'train_tokens_per_second': '1977'} +{'loss': '1.305', 'grad_norm': '3.463', 'learning_rate': '4.076e-05', 'epoch': '0.08155', 'num_input_tokens_seen': 6630233, 'train_runtime': '3354', 'train_tokens_per_second': '1977'} +{'loss': '0.6145', 'grad_norm': '2.622', 'learning_rate': '4.077e-05', 'epoch': '0.08158', 'num_input_tokens_seen': 6632280, 'train_runtime': '3355', 'train_tokens_per_second': '1977'} +{'loss': '0.7022', 'grad_norm': '1.951', 'learning_rate': '4.079e-05', 'epoch': '0.0816', 'num_input_tokens_seen': 6634327, 'train_runtime': '3356', 'train_tokens_per_second': '1977'} +{'loss': '2.555', 'grad_norm': '4.157', 'learning_rate': '4.08e-05', 'epoch': '0.08163', 'num_input_tokens_seen': 6636374, 'train_runtime': '3357', 'train_tokens_per_second': '1977'} +{'loss': '0.9917', 'grad_norm': '2.319', 'learning_rate': '4.081e-05', 'epoch': '0.08165', 'num_input_tokens_seen': 6638421, 'train_runtime': '3358', 'train_tokens_per_second': '1977'} +{'loss': '0.4403', 'grad_norm': '1.592', 'learning_rate': '4.082e-05', 'epoch': '0.08168', 'num_input_tokens_seen': 6640468, 'train_runtime': '3359', 'train_tokens_per_second': '1977'} +{'loss': '0.7002', 'grad_norm': '2.289', 'learning_rate': '4.084e-05', 'epoch': '0.0817', 'num_input_tokens_seen': 6642515, 'train_runtime': '3360', 'train_tokens_per_second': '1977'} +{'loss': '0.598', 'grad_norm': '2.268', 'learning_rate': '4.085e-05', 'epoch': '0.08173', 'num_input_tokens_seen': 6644562, 'train_runtime': '3361', 'train_tokens_per_second': '1977'} +{'loss': '1.402', 'grad_norm': '3.049', 'learning_rate': '4.086e-05', 'epoch': '0.08175', 'num_input_tokens_seen': 6646609, 'train_runtime': '3362', 'train_tokens_per_second': '1977'} +{'loss': '0.3701', 'grad_norm': '1.461', 'learning_rate': '4.087e-05', 'epoch': '0.08178', 'num_input_tokens_seen': 6648656, 'train_runtime': '3363', 'train_tokens_per_second': '1977'} +{'loss': '0.4741', 'grad_norm': '2.074', 'learning_rate': '4.089e-05', 'epoch': '0.0818', 'num_input_tokens_seen': 6650703, 'train_runtime': '3364', 'train_tokens_per_second': '1977'} +{'loss': '0.5775', 'grad_norm': '1.934', 'learning_rate': '4.09e-05', 'epoch': '0.08183', 'num_input_tokens_seen': 6652750, 'train_runtime': '3365', 'train_tokens_per_second': '1977'} +{'loss': '0.4659', 'grad_norm': '1.522', 'learning_rate': '4.091e-05', 'epoch': '0.08185', 'num_input_tokens_seen': 6654797, 'train_runtime': '3366', 'train_tokens_per_second': '1977'} +{'loss': '1.308', 'grad_norm': '2.804', 'learning_rate': '4.092e-05', 'epoch': '0.08188', 'num_input_tokens_seen': 6656844, 'train_runtime': '3367', 'train_tokens_per_second': '1977'} +{'loss': '0.9613', 'grad_norm': '1.567', 'learning_rate': '4.094e-05', 'epoch': '0.0819', 'num_input_tokens_seen': 6658891, 'train_runtime': '3368', 'train_tokens_per_second': '1977'} +{'loss': '0.3532', 'grad_norm': '1.337', 'learning_rate': '4.095e-05', 'epoch': '0.08193', 'num_input_tokens_seen': 6660938, 'train_runtime': '3369', 'train_tokens_per_second': '1977'} +{'loss': '0.6931', 'grad_norm': '2.072', 'learning_rate': '4.096e-05', 'epoch': '0.08195', 'num_input_tokens_seen': 6662985, 'train_runtime': '3370', 'train_tokens_per_second': '1977'} +{'loss': '0.6173', 'grad_norm': '1.955', 'learning_rate': '4.097e-05', 'epoch': '0.08198', 'num_input_tokens_seen': 6665032, 'train_runtime': '3371', 'train_tokens_per_second': '1977'} +{'loss': '0.6921', 'grad_norm': '1.978', 'learning_rate': '4.099e-05', 'epoch': '0.08201', 'num_input_tokens_seen': 6667079, 'train_runtime': '3372', 'train_tokens_per_second': '1977'} +{'loss': '0.5482', 'grad_norm': '1.631', 'learning_rate': '4.1e-05', 'epoch': '0.08203', 'num_input_tokens_seen': 6669126, 'train_runtime': '3373', 'train_tokens_per_second': '1977'} +{'loss': '0.5251', 'grad_norm': '1.844', 'learning_rate': '4.101e-05', 'epoch': '0.08206', 'num_input_tokens_seen': 6671173, 'train_runtime': '3374', 'train_tokens_per_second': '1977'} +{'loss': '0.5675', 'grad_norm': '1.864', 'learning_rate': '4.102e-05', 'epoch': '0.08208', 'num_input_tokens_seen': 6673220, 'train_runtime': '3375', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '2.095', 'learning_rate': '4.104e-05', 'epoch': '0.08211', 'num_input_tokens_seen': 6675267, 'train_runtime': '3377', 'train_tokens_per_second': '1977'} +{'loss': '0.3693', 'grad_norm': '1.254', 'learning_rate': '4.105e-05', 'epoch': '0.08213', 'num_input_tokens_seen': 6677314, 'train_runtime': '3378', 'train_tokens_per_second': '1977'} +{'loss': '2.052', 'grad_norm': '3.706', 'learning_rate': '4.106e-05', 'epoch': '0.08216', 'num_input_tokens_seen': 6679361, 'train_runtime': '3379', 'train_tokens_per_second': '1977'} +{'loss': '0.3675', 'grad_norm': '1.766', 'learning_rate': '4.108e-05', 'epoch': '0.08218', 'num_input_tokens_seen': 6681408, 'train_runtime': '3380', 'train_tokens_per_second': '1977'} +{'loss': '0.721', 'grad_norm': '1.833', 'learning_rate': '4.109e-05', 'epoch': '0.08221', 'num_input_tokens_seen': 6683455, 'train_runtime': '3381', 'train_tokens_per_second': '1977'} +{'loss': '0.7814', 'grad_norm': '1.948', 'learning_rate': '4.11e-05', 'epoch': '0.08223', 'num_input_tokens_seen': 6685502, 'train_runtime': '3382', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.764', 'learning_rate': '4.111e-05', 'epoch': '0.08226', 'num_input_tokens_seen': 6687549, 'train_runtime': '3383', 'train_tokens_per_second': '1977'} +{'loss': '0.4643', 'grad_norm': '1.891', 'learning_rate': '4.113e-05', 'epoch': '0.08228', 'num_input_tokens_seen': 6689596, 'train_runtime': '3384', 'train_tokens_per_second': '1977'} +{'loss': '1.742', 'grad_norm': '2.811', 'learning_rate': '4.114e-05', 'epoch': '0.08231', 'num_input_tokens_seen': 6691643, 'train_runtime': '3385', 'train_tokens_per_second': '1977'} +{'loss': '1.874', 'grad_norm': '3.296', 'learning_rate': '4.115e-05', 'epoch': '0.08233', 'num_input_tokens_seen': 6693690, 'train_runtime': '3386', 'train_tokens_per_second': '1977'} +{'loss': '0.6297', 'grad_norm': '1.876', 'learning_rate': '4.116e-05', 'epoch': '0.08236', 'num_input_tokens_seen': 6695737, 'train_runtime': '3387', 'train_tokens_per_second': '1977'} +{'loss': '0.6471', 'grad_norm': '2.902', 'learning_rate': '4.118e-05', 'epoch': '0.08238', 'num_input_tokens_seen': 6697784, 'train_runtime': '3388', 'train_tokens_per_second': '1977'} +{'loss': '0.9968', 'grad_norm': '2.571', 'learning_rate': '4.119e-05', 'epoch': '0.08241', 'num_input_tokens_seen': 6699831, 'train_runtime': '3389', 'train_tokens_per_second': '1977'} +{'loss': '1.615', 'grad_norm': '2.84', 'learning_rate': '4.12e-05', 'epoch': '0.08243', 'num_input_tokens_seen': 6701878, 'train_runtime': '3390', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '2.912', 'learning_rate': '4.121e-05', 'epoch': '0.08246', 'num_input_tokens_seen': 6703925, 'train_runtime': '3391', 'train_tokens_per_second': '1977'} +{'loss': '1.543', 'grad_norm': '4.06', 'learning_rate': '4.123e-05', 'epoch': '0.08248', 'num_input_tokens_seen': 6705972, 'train_runtime': '3392', 'train_tokens_per_second': '1977'} +{'loss': '1.243', 'grad_norm': '2.293', 'learning_rate': '4.124e-05', 'epoch': '0.08251', 'num_input_tokens_seen': 6708019, 'train_runtime': '3393', 'train_tokens_per_second': '1977'} +{'loss': '0.4448', 'grad_norm': '2', 'learning_rate': '4.125e-05', 'epoch': '0.08253', 'num_input_tokens_seen': 6710066, 'train_runtime': '3394', 'train_tokens_per_second': '1977'} +{'loss': '0.4406', 'grad_norm': '1.496', 'learning_rate': '4.126e-05', 'epoch': '0.08256', 'num_input_tokens_seen': 6712113, 'train_runtime': '3395', 'train_tokens_per_second': '1977'} +{'loss': '1.456', 'grad_norm': '3.152', 'learning_rate': '4.128e-05', 'epoch': '0.08258', 'num_input_tokens_seen': 6714160, 'train_runtime': '3396', 'train_tokens_per_second': '1977'} +{'loss': '0.5054', 'grad_norm': '1.724', 'learning_rate': '4.129e-05', 'epoch': '0.08261', 'num_input_tokens_seen': 6716207, 'train_runtime': '3397', 'train_tokens_per_second': '1977'} +{'loss': '0.3807', 'grad_norm': '1.561', 'learning_rate': '4.13e-05', 'epoch': '0.08263', 'num_input_tokens_seen': 6718254, 'train_runtime': '3398', 'train_tokens_per_second': '1977'} +{'loss': '0.4541', 'grad_norm': '1.747', 'learning_rate': '4.131e-05', 'epoch': '0.08266', 'num_input_tokens_seen': 6720301, 'train_runtime': '3399', 'train_tokens_per_second': '1977'} +{'loss': '1.003', 'grad_norm': '1.749', 'learning_rate': '4.133e-05', 'epoch': '0.08268', 'num_input_tokens_seen': 6722348, 'train_runtime': '3400', 'train_tokens_per_second': '1977'} +{'loss': '1.997', 'grad_norm': '2.686', 'learning_rate': '4.134e-05', 'epoch': '0.08271', 'num_input_tokens_seen': 6724395, 'train_runtime': '3401', 'train_tokens_per_second': '1977'} +{'loss': '0.848', 'grad_norm': '2.018', 'learning_rate': '4.135e-05', 'epoch': '0.08274', 'num_input_tokens_seen': 6726442, 'train_runtime': '3402', 'train_tokens_per_second': '1977'} +{'loss': '0.6604', 'grad_norm': '2.024', 'learning_rate': '4.136e-05', 'epoch': '0.08276', 'num_input_tokens_seen': 6728489, 'train_runtime': '3403', 'train_tokens_per_second': '1977'} +{'loss': '0.4632', 'grad_norm': '1.38', 'learning_rate': '4.138e-05', 'epoch': '0.08279', 'num_input_tokens_seen': 6730536, 'train_runtime': '3404', 'train_tokens_per_second': '1977'} +{'loss': '0.3372', 'grad_norm': '1.495', 'learning_rate': '4.139e-05', 'epoch': '0.08281', 'num_input_tokens_seen': 6732583, 'train_runtime': '3405', 'train_tokens_per_second': '1977'} +{'loss': '0.4898', 'grad_norm': '1.649', 'learning_rate': '4.14e-05', 'epoch': '0.08284', 'num_input_tokens_seen': 6734630, 'train_runtime': '3406', 'train_tokens_per_second': '1977'} +{'loss': '0.7212', 'grad_norm': '2.717', 'learning_rate': '4.141e-05', 'epoch': '0.08286', 'num_input_tokens_seen': 6736677, 'train_runtime': '3408', 'train_tokens_per_second': '1977'} +{'loss': '0.9046', 'grad_norm': '2.327', 'learning_rate': '4.143e-05', 'epoch': '0.08289', 'num_input_tokens_seen': 6738724, 'train_runtime': '3409', 'train_tokens_per_second': '1977'} +{'loss': '0.4734', 'grad_norm': '1.911', 'learning_rate': '4.144e-05', 'epoch': '0.08291', 'num_input_tokens_seen': 6740771, 'train_runtime': '3410', 'train_tokens_per_second': '1977'} +{'loss': '0.9386', 'grad_norm': '2.149', 'learning_rate': '4.145e-05', 'epoch': '0.08294', 'num_input_tokens_seen': 6742818, 'train_runtime': '3411', 'train_tokens_per_second': '1977'} +{'loss': '0.6033', 'grad_norm': '1.814', 'learning_rate': '4.147e-05', 'epoch': '0.08296', 'num_input_tokens_seen': 6744865, 'train_runtime': '3412', 'train_tokens_per_second': '1977'} +{'loss': '0.6261', 'grad_norm': '1.685', 'learning_rate': '4.148e-05', 'epoch': '0.08299', 'num_input_tokens_seen': 6746912, 'train_runtime': '3413', 'train_tokens_per_second': '1977'} +{'loss': '0.8862', 'grad_norm': '3.058', 'learning_rate': '4.149e-05', 'epoch': '0.08301', 'num_input_tokens_seen': 6748959, 'train_runtime': '3414', 'train_tokens_per_second': '1977'} +{'loss': '1.487', 'grad_norm': '2.981', 'learning_rate': '4.15e-05', 'epoch': '0.08304', 'num_input_tokens_seen': 6751006, 'train_runtime': '3415', 'train_tokens_per_second': '1977'} +{'loss': '1.494', 'grad_norm': '2.44', 'learning_rate': '4.152e-05', 'epoch': '0.08306', 'num_input_tokens_seen': 6753053, 'train_runtime': '3416', 'train_tokens_per_second': '1977'} +{'loss': '0.467', 'grad_norm': '1.636', 'learning_rate': '4.153e-05', 'epoch': '0.08309', 'num_input_tokens_seen': 6755100, 'train_runtime': '3417', 'train_tokens_per_second': '1977'} +{'loss': '1.843', 'grad_norm': '2.913', 'learning_rate': '4.154e-05', 'epoch': '0.08311', 'num_input_tokens_seen': 6757147, 'train_runtime': '3418', 'train_tokens_per_second': '1977'} +{'loss': '1.318', 'grad_norm': '2.625', 'learning_rate': '4.155e-05', 'epoch': '0.08314', 'num_input_tokens_seen': 6759194, 'train_runtime': '3419', 'train_tokens_per_second': '1977'} +{'loss': '0.89', 'grad_norm': '1.816', 'learning_rate': '4.157e-05', 'epoch': '0.08316', 'num_input_tokens_seen': 6761241, 'train_runtime': '3420', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '2.98', 'learning_rate': '4.158e-05', 'epoch': '0.08319', 'num_input_tokens_seen': 6763288, 'train_runtime': '3421', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '2.183', 'learning_rate': '4.159e-05', 'epoch': '0.08321', 'num_input_tokens_seen': 6765335, 'train_runtime': '3422', 'train_tokens_per_second': '1977'} +{'loss': '0.4317', 'grad_norm': '1.727', 'learning_rate': '4.16e-05', 'epoch': '0.08324', 'num_input_tokens_seen': 6767382, 'train_runtime': '3423', 'train_tokens_per_second': '1977'} +{'loss': '0.8344', 'grad_norm': '1.897', 'learning_rate': '4.162e-05', 'epoch': '0.08326', 'num_input_tokens_seen': 6769429, 'train_runtime': '3424', 'train_tokens_per_second': '1977'} +{'loss': '0.3804', 'grad_norm': '1.6', 'learning_rate': '4.163e-05', 'epoch': '0.08329', 'num_input_tokens_seen': 6771476, 'train_runtime': '3425', 'train_tokens_per_second': '1977'} +{'loss': '0.9952', 'grad_norm': '2.452', 'learning_rate': '4.164e-05', 'epoch': '0.08331', 'num_input_tokens_seen': 6773523, 'train_runtime': '3426', 'train_tokens_per_second': '1977'} +{'loss': '1.534', 'grad_norm': '3.278', 'learning_rate': '4.165e-05', 'epoch': '0.08334', 'num_input_tokens_seen': 6775570, 'train_runtime': '3427', 'train_tokens_per_second': '1977'} +{'loss': '0.3399', 'grad_norm': '1.296', 'learning_rate': '4.167e-05', 'epoch': '0.08336', 'num_input_tokens_seen': 6777617, 'train_runtime': '3428', 'train_tokens_per_second': '1977'} +{'loss': '0.9087', 'grad_norm': '2.277', 'learning_rate': '4.168e-05', 'epoch': '0.08339', 'num_input_tokens_seen': 6779664, 'train_runtime': '3429', 'train_tokens_per_second': '1977'} +{'loss': '0.3857', 'grad_norm': '1.274', 'learning_rate': '4.169e-05', 'epoch': '0.08342', 'num_input_tokens_seen': 6781711, 'train_runtime': '3430', 'train_tokens_per_second': '1977'} +{'loss': '0.6148', 'grad_norm': '1.967', 'learning_rate': '4.17e-05', 'epoch': '0.08344', 'num_input_tokens_seen': 6783758, 'train_runtime': '3431', 'train_tokens_per_second': '1977'} +{'loss': '2.231', 'grad_norm': '2.664', 'learning_rate': '4.172e-05', 'epoch': '0.08347', 'num_input_tokens_seen': 6785805, 'train_runtime': '3432', 'train_tokens_per_second': '1977'} +{'loss': '1.591', 'grad_norm': '2.857', 'learning_rate': '4.173e-05', 'epoch': '0.08349', 'num_input_tokens_seen': 6787852, 'train_runtime': '3433', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '2.674', 'learning_rate': '4.174e-05', 'epoch': '0.08352', 'num_input_tokens_seen': 6789899, 'train_runtime': '3434', 'train_tokens_per_second': '1977'} +{'loss': '1.501', 'grad_norm': '2.681', 'learning_rate': '4.175e-05', 'epoch': '0.08354', 'num_input_tokens_seen': 6791946, 'train_runtime': '3435', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '2.581', 'learning_rate': '4.177e-05', 'epoch': '0.08357', 'num_input_tokens_seen': 6793993, 'train_runtime': '3437', 'train_tokens_per_second': '1977'} +{'loss': '1.43', 'grad_norm': '2.88', 'learning_rate': '4.178e-05', 'epoch': '0.08359', 'num_input_tokens_seen': 6796040, 'train_runtime': '3438', 'train_tokens_per_second': '1977'} +{'loss': '0.6063', 'grad_norm': '1.863', 'learning_rate': '4.179e-05', 'epoch': '0.08362', 'num_input_tokens_seen': 6798087, 'train_runtime': '3439', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '2.436', 'learning_rate': '4.181e-05', 'epoch': '0.08364', 'num_input_tokens_seen': 6800134, 'train_runtime': '3440', 'train_tokens_per_second': '1977'} +{'loss': '0.3923', 'grad_norm': '2.043', 'learning_rate': '4.182e-05', 'epoch': '0.08367', 'num_input_tokens_seen': 6802181, 'train_runtime': '3441', 'train_tokens_per_second': '1977'} +{'loss': '0.793', 'grad_norm': '2.053', 'learning_rate': '4.183e-05', 'epoch': '0.08369', 'num_input_tokens_seen': 6804228, 'train_runtime': '3442', 'train_tokens_per_second': '1977'} +{'loss': '0.8013', 'grad_norm': '1.744', 'learning_rate': '4.184e-05', 'epoch': '0.08372', 'num_input_tokens_seen': 6806275, 'train_runtime': '3443', 'train_tokens_per_second': '1977'} +{'loss': '0.3324', 'grad_norm': '1.594', 'learning_rate': '4.186e-05', 'epoch': '0.08374', 'num_input_tokens_seen': 6808322, 'train_runtime': '3444', 'train_tokens_per_second': '1977'} +{'loss': '0.8035', 'grad_norm': '2.309', 'learning_rate': '4.187e-05', 'epoch': '0.08377', 'num_input_tokens_seen': 6810369, 'train_runtime': '3445', 'train_tokens_per_second': '1977'} +{'loss': '0.7751', 'grad_norm': '2.342', 'learning_rate': '4.188e-05', 'epoch': '0.08379', 'num_input_tokens_seen': 6812416, 'train_runtime': '3446', 'train_tokens_per_second': '1977'} +{'loss': '1.767', 'grad_norm': '3.566', 'learning_rate': '4.189e-05', 'epoch': '0.08382', 'num_input_tokens_seen': 6814463, 'train_runtime': '3447', 'train_tokens_per_second': '1977'} +{'loss': '2.131', 'grad_norm': '3.336', 'learning_rate': '4.191e-05', 'epoch': '0.08384', 'num_input_tokens_seen': 6816510, 'train_runtime': '3448', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '1.996', 'learning_rate': '4.192e-05', 'epoch': '0.08387', 'num_input_tokens_seen': 6818557, 'train_runtime': '3449', 'train_tokens_per_second': '1977'} +{'loss': '2.094', 'grad_norm': '2.961', 'learning_rate': '4.193e-05', 'epoch': '0.08389', 'num_input_tokens_seen': 6820604, 'train_runtime': '3450', 'train_tokens_per_second': '1977'} +{'loss': '0.4223', 'grad_norm': '1.412', 'learning_rate': '4.194e-05', 'epoch': '0.08392', 'num_input_tokens_seen': 6822651, 'train_runtime': '3451', 'train_tokens_per_second': '1977'} +{'loss': '0.8748', 'grad_norm': '2.211', 'learning_rate': '4.196e-05', 'epoch': '0.08394', 'num_input_tokens_seen': 6824698, 'train_runtime': '3452', 'train_tokens_per_second': '1977'} +{'loss': '2.473', 'grad_norm': '2.698', 'learning_rate': '4.197e-05', 'epoch': '0.08397', 'num_input_tokens_seen': 6826745, 'train_runtime': '3453', 'train_tokens_per_second': '1977'} +{'loss': '0.3254', 'grad_norm': '1.498', 'learning_rate': '4.198e-05', 'epoch': '0.08399', 'num_input_tokens_seen': 6828792, 'train_runtime': '3454', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '1.742', 'learning_rate': '4.199e-05', 'epoch': '0.08402', 'num_input_tokens_seen': 6830839, 'train_runtime': '3455', 'train_tokens_per_second': '1977'} +{'loss': '1.628', 'grad_norm': '3.191', 'learning_rate': '4.201e-05', 'epoch': '0.08404', 'num_input_tokens_seen': 6832886, 'train_runtime': '3456', 'train_tokens_per_second': '1977'} +{'loss': '0.3716', 'grad_norm': '1.516', 'learning_rate': '4.202e-05', 'epoch': '0.08407', 'num_input_tokens_seen': 6834933, 'train_runtime': '3457', 'train_tokens_per_second': '1977'} +{'loss': '1.963', 'grad_norm': '2.535', 'learning_rate': '4.203e-05', 'epoch': '0.08409', 'num_input_tokens_seen': 6836980, 'train_runtime': '3458', 'train_tokens_per_second': '1977'} +{'loss': '0.9314', 'grad_norm': '2.357', 'learning_rate': '4.204e-05', 'epoch': '0.08412', 'num_input_tokens_seen': 6839027, 'train_runtime': '3459', 'train_tokens_per_second': '1977'} +{'loss': '0.4267', 'grad_norm': '1.696', 'learning_rate': '4.206e-05', 'epoch': '0.08415', 'num_input_tokens_seen': 6841074, 'train_runtime': '3460', 'train_tokens_per_second': '1977'} +{'loss': '1.003', 'grad_norm': '2.504', 'learning_rate': '4.207e-05', 'epoch': '0.08417', 'num_input_tokens_seen': 6843121, 'train_runtime': '3461', 'train_tokens_per_second': '1977'} +{'loss': '0.3552', 'grad_norm': '1.353', 'learning_rate': '4.208e-05', 'epoch': '0.0842', 'num_input_tokens_seen': 6845168, 'train_runtime': '3462', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '2.249', 'learning_rate': '4.209e-05', 'epoch': '0.08422', 'num_input_tokens_seen': 6847215, 'train_runtime': '3463', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '2.178', 'learning_rate': '4.211e-05', 'epoch': '0.08425', 'num_input_tokens_seen': 6849262, 'train_runtime': '3464', 'train_tokens_per_second': '1977'} +{'loss': '1.615', 'grad_norm': '2.498', 'learning_rate': '4.212e-05', 'epoch': '0.08427', 'num_input_tokens_seen': 6851309, 'train_runtime': '3465', 'train_tokens_per_second': '1977'} +{'loss': '2.434', 'grad_norm': '3.068', 'learning_rate': '4.213e-05', 'epoch': '0.0843', 'num_input_tokens_seen': 6853356, 'train_runtime': '3466', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.444', 'learning_rate': '4.215e-05', 'epoch': '0.08432', 'num_input_tokens_seen': 6855403, 'train_runtime': '3468', 'train_tokens_per_second': '1977'} +{'loss': '0.518', 'grad_norm': '1.591', 'learning_rate': '4.216e-05', 'epoch': '0.08435', 'num_input_tokens_seen': 6857450, 'train_runtime': '3469', 'train_tokens_per_second': '1977'} +{'loss': '0.4529', 'grad_norm': '1.514', 'learning_rate': '4.217e-05', 'epoch': '0.08437', 'num_input_tokens_seen': 6859497, 'train_runtime': '3470', 'train_tokens_per_second': '1977'} +{'loss': '0.5861', 'grad_norm': '1.972', 'learning_rate': '4.218e-05', 'epoch': '0.0844', 'num_input_tokens_seen': 6861544, 'train_runtime': '3471', 'train_tokens_per_second': '1977'} +{'loss': '0.673', 'grad_norm': '2.377', 'learning_rate': '4.22e-05', 'epoch': '0.08442', 'num_input_tokens_seen': 6863591, 'train_runtime': '3472', 'train_tokens_per_second': '1977'} +{'loss': '2.021', 'grad_norm': '3.049', 'learning_rate': '4.221e-05', 'epoch': '0.08445', 'num_input_tokens_seen': 6865638, 'train_runtime': '3473', 'train_tokens_per_second': '1977'} +{'loss': '0.8487', 'grad_norm': '2.216', 'learning_rate': '4.222e-05', 'epoch': '0.08447', 'num_input_tokens_seen': 6867685, 'train_runtime': '3474', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '2.37', 'learning_rate': '4.223e-05', 'epoch': '0.0845', 'num_input_tokens_seen': 6869732, 'train_runtime': '3475', 'train_tokens_per_second': '1977'} +{'loss': '0.9131', 'grad_norm': '2.729', 'learning_rate': '4.225e-05', 'epoch': '0.08452', 'num_input_tokens_seen': 6871779, 'train_runtime': '3476', 'train_tokens_per_second': '1977'} +{'loss': '0.8914', 'grad_norm': '1.878', 'learning_rate': '4.226e-05', 'epoch': '0.08455', 'num_input_tokens_seen': 6873826, 'train_runtime': '3477', 'train_tokens_per_second': '1977'} +{'loss': '1.435', 'grad_norm': '3.095', 'learning_rate': '4.227e-05', 'epoch': '0.08457', 'num_input_tokens_seen': 6875873, 'train_runtime': '3478', 'train_tokens_per_second': '1977'} +{'loss': '0.9092', 'grad_norm': '1.891', 'learning_rate': '4.228e-05', 'epoch': '0.0846', 'num_input_tokens_seen': 6877920, 'train_runtime': '3479', 'train_tokens_per_second': '1977'} +{'loss': '1.784', 'grad_norm': '2.851', 'learning_rate': '4.23e-05', 'epoch': '0.08462', 'num_input_tokens_seen': 6879967, 'train_runtime': '3480', 'train_tokens_per_second': '1977'} +{'loss': '1.36', 'grad_norm': '2.613', 'learning_rate': '4.231e-05', 'epoch': '0.08465', 'num_input_tokens_seen': 6882014, 'train_runtime': '3481', 'train_tokens_per_second': '1977'} +{'loss': '0.4472', 'grad_norm': '1.432', 'learning_rate': '4.232e-05', 'epoch': '0.08467', 'num_input_tokens_seen': 6884061, 'train_runtime': '3482', 'train_tokens_per_second': '1977'} +{'loss': '1.321', 'grad_norm': '2.555', 'learning_rate': '4.233e-05', 'epoch': '0.0847', 'num_input_tokens_seen': 6886108, 'train_runtime': '3483', 'train_tokens_per_second': '1977'} +{'loss': '0.6017', 'grad_norm': '1.761', 'learning_rate': '4.235e-05', 'epoch': '0.08472', 'num_input_tokens_seen': 6888155, 'train_runtime': '3484', 'train_tokens_per_second': '1977'} +{'loss': '0.5303', 'grad_norm': '1.737', 'learning_rate': '4.236e-05', 'epoch': '0.08475', 'num_input_tokens_seen': 6890202, 'train_runtime': '3485', 'train_tokens_per_second': '1977'} +{'loss': '0.4823', 'grad_norm': '1.511', 'learning_rate': '4.237e-05', 'epoch': '0.08477', 'num_input_tokens_seen': 6892249, 'train_runtime': '3486', 'train_tokens_per_second': '1977'} +{'loss': '1.536', 'grad_norm': '2.668', 'learning_rate': '4.238e-05', 'epoch': '0.0848', 'num_input_tokens_seen': 6894296, 'train_runtime': '3487', 'train_tokens_per_second': '1977'} +{'loss': '0.6745', 'grad_norm': '2.14', 'learning_rate': '4.24e-05', 'epoch': '0.08483', 'num_input_tokens_seen': 6896343, 'train_runtime': '3488', 'train_tokens_per_second': '1977'} +{'loss': '0.5028', 'grad_norm': '1.678', 'learning_rate': '4.241e-05', 'epoch': '0.08485', 'num_input_tokens_seen': 6898390, 'train_runtime': '3489', 'train_tokens_per_second': '1977'} +{'loss': '0.6021', 'grad_norm': '2.105', 'learning_rate': '4.242e-05', 'epoch': '0.08488', 'num_input_tokens_seen': 6900437, 'train_runtime': '3490', 'train_tokens_per_second': '1977'} +{'loss': '0.4375', 'grad_norm': '1.617', 'learning_rate': '4.243e-05', 'epoch': '0.0849', 'num_input_tokens_seen': 6902484, 'train_runtime': '3491', 'train_tokens_per_second': '1977'} +{'loss': '0.5073', 'grad_norm': '1.751', 'learning_rate': '4.245e-05', 'epoch': '0.08493', 'num_input_tokens_seen': 6904531, 'train_runtime': '3492', 'train_tokens_per_second': '1977'} +{'loss': '1.687', 'grad_norm': '3.436', 'learning_rate': '4.246e-05', 'epoch': '0.08495', 'num_input_tokens_seen': 6906578, 'train_runtime': '3493', 'train_tokens_per_second': '1977'} +{'loss': '0.5562', 'grad_norm': '1.871', 'learning_rate': '4.247e-05', 'epoch': '0.08498', 'num_input_tokens_seen': 6908625, 'train_runtime': '3494', 'train_tokens_per_second': '1977'} +{'loss': '0.499', 'grad_norm': '1.475', 'learning_rate': '4.248e-05', 'epoch': '0.085', 'num_input_tokens_seen': 6910672, 'train_runtime': '3495', 'train_tokens_per_second': '1977'} +{'loss': '0.5391', 'grad_norm': '1.976', 'learning_rate': '4.25e-05', 'epoch': '0.08503', 'num_input_tokens_seen': 6912719, 'train_runtime': '3496', 'train_tokens_per_second': '1977'} +{'loss': '0.8316', 'grad_norm': '1.968', 'learning_rate': '4.251e-05', 'epoch': '0.08505', 'num_input_tokens_seen': 6914766, 'train_runtime': '3497', 'train_tokens_per_second': '1977'} +{'loss': '0.7973', 'grad_norm': '2.133', 'learning_rate': '4.252e-05', 'epoch': '0.08508', 'num_input_tokens_seen': 6916813, 'train_runtime': '3498', 'train_tokens_per_second': '1977'} +{'loss': '1.481', 'grad_norm': '2.807', 'learning_rate': '4.254e-05', 'epoch': '0.0851', 'num_input_tokens_seen': 6918860, 'train_runtime': '3500', 'train_tokens_per_second': '1977'} +{'loss': '0.8681', 'grad_norm': '2.402', 'learning_rate': '4.255e-05', 'epoch': '0.08513', 'num_input_tokens_seen': 6920907, 'train_runtime': '3501', 'train_tokens_per_second': '1977'} +{'loss': '2.561', 'grad_norm': '3.517', 'learning_rate': '4.256e-05', 'epoch': '0.08515', 'num_input_tokens_seen': 6922954, 'train_runtime': '3502', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '2.66', 'learning_rate': '4.257e-05', 'epoch': '0.08518', 'num_input_tokens_seen': 6925001, 'train_runtime': '3503', 'train_tokens_per_second': '1977'} +{'loss': '1.31', 'grad_norm': '2.535', 'learning_rate': '4.259e-05', 'epoch': '0.0852', 'num_input_tokens_seen': 6927048, 'train_runtime': '3504', 'train_tokens_per_second': '1977'} +{'loss': '1.755', 'grad_norm': '2.851', 'learning_rate': '4.26e-05', 'epoch': '0.08523', 'num_input_tokens_seen': 6929095, 'train_runtime': '3505', 'train_tokens_per_second': '1977'} +{'loss': '0.9109', 'grad_norm': '1.964', 'learning_rate': '4.261e-05', 'epoch': '0.08525', 'num_input_tokens_seen': 6931142, 'train_runtime': '3506', 'train_tokens_per_second': '1977'} +{'loss': '0.4086', 'grad_norm': '1.404', 'learning_rate': '4.262e-05', 'epoch': '0.08528', 'num_input_tokens_seen': 6933189, 'train_runtime': '3507', 'train_tokens_per_second': '1977'} +{'loss': '0.3575', 'grad_norm': '1.664', 'learning_rate': '4.264e-05', 'epoch': '0.0853', 'num_input_tokens_seen': 6935236, 'train_runtime': '3508', 'train_tokens_per_second': '1977'} +{'loss': '0.7975', 'grad_norm': '2.099', 'learning_rate': '4.265e-05', 'epoch': '0.08533', 'num_input_tokens_seen': 6937283, 'train_runtime': '3509', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '2.771', 'learning_rate': '4.266e-05', 'epoch': '0.08535', 'num_input_tokens_seen': 6939330, 'train_runtime': '3510', 'train_tokens_per_second': '1977'} +{'loss': '0.8882', 'grad_norm': '2.771', 'learning_rate': '4.267e-05', 'epoch': '0.08538', 'num_input_tokens_seen': 6941377, 'train_runtime': '3511', 'train_tokens_per_second': '1977'} +{'loss': '0.7543', 'grad_norm': '1.705', 'learning_rate': '4.269e-05', 'epoch': '0.0854', 'num_input_tokens_seen': 6943424, 'train_runtime': '3512', 'train_tokens_per_second': '1977'} +{'loss': '0.4508', 'grad_norm': '1.553', 'learning_rate': '4.27e-05', 'epoch': '0.08543', 'num_input_tokens_seen': 6945471, 'train_runtime': '3513', 'train_tokens_per_second': '1977'} +{'loss': '0.7869', 'grad_norm': '1.839', 'learning_rate': '4.271e-05', 'epoch': '0.08545', 'num_input_tokens_seen': 6947518, 'train_runtime': '3514', 'train_tokens_per_second': '1977'} +{'loss': '0.3785', 'grad_norm': '1.632', 'learning_rate': '4.272e-05', 'epoch': '0.08548', 'num_input_tokens_seen': 6949565, 'train_runtime': '3515', 'train_tokens_per_second': '1977'} +{'loss': '0.6886', 'grad_norm': '2.52', 'learning_rate': '4.274e-05', 'epoch': '0.0855', 'num_input_tokens_seen': 6951612, 'train_runtime': '3516', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '1.769', 'learning_rate': '4.275e-05', 'epoch': '0.08553', 'num_input_tokens_seen': 6953659, 'train_runtime': '3517', 'train_tokens_per_second': '1977'} +{'loss': '0.9323', 'grad_norm': '2.631', 'learning_rate': '4.276e-05', 'epoch': '0.08556', 'num_input_tokens_seen': 6955706, 'train_runtime': '3518', 'train_tokens_per_second': '1977'} +{'loss': '0.5754', 'grad_norm': '2.194', 'learning_rate': '4.277e-05', 'epoch': '0.08558', 'num_input_tokens_seen': 6957753, 'train_runtime': '3519', 'train_tokens_per_second': '1977'} +{'loss': '0.9032', 'grad_norm': '2.238', 'learning_rate': '4.279e-05', 'epoch': '0.08561', 'num_input_tokens_seen': 6959800, 'train_runtime': '3520', 'train_tokens_per_second': '1977'} +{'loss': '0.4231', 'grad_norm': '2.192', 'learning_rate': '4.28e-05', 'epoch': '0.08563', 'num_input_tokens_seen': 6961847, 'train_runtime': '3521', 'train_tokens_per_second': '1977'} +{'loss': '0.8933', 'grad_norm': '2.157', 'learning_rate': '4.281e-05', 'epoch': '0.08566', 'num_input_tokens_seen': 6963894, 'train_runtime': '3522', 'train_tokens_per_second': '1977'} +{'loss': '0.4718', 'grad_norm': '1.784', 'learning_rate': '4.282e-05', 'epoch': '0.08568', 'num_input_tokens_seen': 6965941, 'train_runtime': '3523', 'train_tokens_per_second': '1977'} +{'loss': '0.8166', 'grad_norm': '1.948', 'learning_rate': '4.284e-05', 'epoch': '0.08571', 'num_input_tokens_seen': 6967988, 'train_runtime': '3524', 'train_tokens_per_second': '1977'} +{'loss': '1.039', 'grad_norm': '2.341', 'learning_rate': '4.285e-05', 'epoch': '0.08573', 'num_input_tokens_seen': 6970035, 'train_runtime': '3525', 'train_tokens_per_second': '1977'} +{'loss': '1.599', 'grad_norm': '3.323', 'learning_rate': '4.286e-05', 'epoch': '0.08576', 'num_input_tokens_seen': 6972082, 'train_runtime': '3526', 'train_tokens_per_second': '1977'} +{'loss': '1.182', 'grad_norm': '2.223', 'learning_rate': '4.288e-05', 'epoch': '0.08578', 'num_input_tokens_seen': 6974129, 'train_runtime': '3527', 'train_tokens_per_second': '1977'} +{'loss': '0.7224', 'grad_norm': '2.037', 'learning_rate': '4.289e-05', 'epoch': '0.08581', 'num_input_tokens_seen': 6976176, 'train_runtime': '3528', 'train_tokens_per_second': '1977'} +{'loss': '2.173', 'grad_norm': '5.255', 'learning_rate': '4.29e-05', 'epoch': '0.08583', 'num_input_tokens_seen': 6978223, 'train_runtime': '3530', 'train_tokens_per_second': '1977'} +{'loss': '0.8247', 'grad_norm': '1.863', 'learning_rate': '4.291e-05', 'epoch': '0.08586', 'num_input_tokens_seen': 6980270, 'train_runtime': '3531', 'train_tokens_per_second': '1977'} +{'loss': '0.7625', 'grad_norm': '2.285', 'learning_rate': '4.293e-05', 'epoch': '0.08588', 'num_input_tokens_seen': 6982317, 'train_runtime': '3532', 'train_tokens_per_second': '1977'} +{'loss': '1.857', 'grad_norm': '2.705', 'learning_rate': '4.294e-05', 'epoch': '0.08591', 'num_input_tokens_seen': 6984364, 'train_runtime': '3533', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '2.293', 'learning_rate': '4.295e-05', 'epoch': '0.08593', 'num_input_tokens_seen': 6986411, 'train_runtime': '3534', 'train_tokens_per_second': '1977'} +{'loss': '1.53', 'grad_norm': '2.81', 'learning_rate': '4.296e-05', 'epoch': '0.08596', 'num_input_tokens_seen': 6988458, 'train_runtime': '3535', 'train_tokens_per_second': '1977'} +{'loss': '0.4089', 'grad_norm': '1.584', 'learning_rate': '4.298e-05', 'epoch': '0.08598', 'num_input_tokens_seen': 6990505, 'train_runtime': '3536', 'train_tokens_per_second': '1977'} +{'loss': '1.182', 'grad_norm': '2.247', 'learning_rate': '4.299e-05', 'epoch': '0.08601', 'num_input_tokens_seen': 6992552, 'train_runtime': '3537', 'train_tokens_per_second': '1977'} +{'loss': '0.4081', 'grad_norm': '1.257', 'learning_rate': '4.3e-05', 'epoch': '0.08603', 'num_input_tokens_seen': 6994599, 'train_runtime': '3538', 'train_tokens_per_second': '1977'} +{'loss': '0.8969', 'grad_norm': '2.49', 'learning_rate': '4.301e-05', 'epoch': '0.08606', 'num_input_tokens_seen': 6996646, 'train_runtime': '3539', 'train_tokens_per_second': '1977'} +{'loss': '0.4294', 'grad_norm': '1.749', 'learning_rate': '4.303e-05', 'epoch': '0.08608', 'num_input_tokens_seen': 6998693, 'train_runtime': '3540', 'train_tokens_per_second': '1977'} +{'loss': '1.684', 'grad_norm': '3.05', 'learning_rate': '4.304e-05', 'epoch': '0.08611', 'num_input_tokens_seen': 7000740, 'train_runtime': '3541', 'train_tokens_per_second': '1977'} +{'loss': '0.9891', 'grad_norm': '1.925', 'learning_rate': '4.305e-05', 'epoch': '0.08613', 'num_input_tokens_seen': 7002787, 'train_runtime': '3542', 'train_tokens_per_second': '1977'} +{'loss': '0.9591', 'grad_norm': '2.139', 'learning_rate': '4.306e-05', 'epoch': '0.08616', 'num_input_tokens_seen': 7004834, 'train_runtime': '3543', 'train_tokens_per_second': '1977'} +{'loss': '0.702', 'grad_norm': '2.292', 'learning_rate': '4.308e-05', 'epoch': '0.08618', 'num_input_tokens_seen': 7006881, 'train_runtime': '3544', 'train_tokens_per_second': '1977'} +{'loss': '0.9802', 'grad_norm': '2.096', 'learning_rate': '4.309e-05', 'epoch': '0.08621', 'num_input_tokens_seen': 7008928, 'train_runtime': '3545', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '2.51', 'learning_rate': '4.31e-05', 'epoch': '0.08624', 'num_input_tokens_seen': 7010975, 'train_runtime': '3546', 'train_tokens_per_second': '1977'} +{'loss': '0.4684', 'grad_norm': '1.749', 'learning_rate': '4.311e-05', 'epoch': '0.08626', 'num_input_tokens_seen': 7013022, 'train_runtime': '3547', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '2.348', 'learning_rate': '4.313e-05', 'epoch': '0.08629', 'num_input_tokens_seen': 7015069, 'train_runtime': '3548', 'train_tokens_per_second': '1977'} +{'loss': '1.067', 'grad_norm': '2.346', 'learning_rate': '4.314e-05', 'epoch': '0.08631', 'num_input_tokens_seen': 7017116, 'train_runtime': '3549', 'train_tokens_per_second': '1977'} +{'loss': '1.411', 'grad_norm': '2.303', 'learning_rate': '4.315e-05', 'epoch': '0.08634', 'num_input_tokens_seen': 7019163, 'train_runtime': '3550', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '2.358', 'learning_rate': '4.316e-05', 'epoch': '0.08636', 'num_input_tokens_seen': 7021210, 'train_runtime': '3551', 'train_tokens_per_second': '1977'} +{'loss': '1.362', 'grad_norm': '2.475', 'learning_rate': '4.318e-05', 'epoch': '0.08639', 'num_input_tokens_seen': 7023257, 'train_runtime': '3552', 'train_tokens_per_second': '1977'} +{'loss': '2.497', 'grad_norm': '3.161', 'learning_rate': '4.319e-05', 'epoch': '0.08641', 'num_input_tokens_seen': 7025304, 'train_runtime': '3553', 'train_tokens_per_second': '1977'} +{'loss': '1.039', 'grad_norm': '2.975', 'learning_rate': '4.32e-05', 'epoch': '0.08644', 'num_input_tokens_seen': 7027351, 'train_runtime': '3554', 'train_tokens_per_second': '1977'} +{'loss': '1.374', 'grad_norm': '2.574', 'learning_rate': '4.322e-05', 'epoch': '0.08646', 'num_input_tokens_seen': 7029398, 'train_runtime': '3555', 'train_tokens_per_second': '1977'} +{'loss': '0.4337', 'grad_norm': '1.459', 'learning_rate': '4.323e-05', 'epoch': '0.08649', 'num_input_tokens_seen': 7031445, 'train_runtime': '3556', 'train_tokens_per_second': '1977'} +{'loss': '0.9173', 'grad_norm': '2.673', 'learning_rate': '4.324e-05', 'epoch': '0.08651', 'num_input_tokens_seen': 7033492, 'train_runtime': '3557', 'train_tokens_per_second': '1977'} +{'loss': '0.3801', 'grad_norm': '1.601', 'learning_rate': '4.325e-05', 'epoch': '0.08654', 'num_input_tokens_seen': 7035539, 'train_runtime': '3558', 'train_tokens_per_second': '1977'} +{'loss': '0.361', 'grad_norm': '1.585', 'learning_rate': '4.327e-05', 'epoch': '0.08656', 'num_input_tokens_seen': 7037586, 'train_runtime': '3560', 'train_tokens_per_second': '1977'} +{'loss': '0.8167', 'grad_norm': '1.613', 'learning_rate': '4.328e-05', 'epoch': '0.08659', 'num_input_tokens_seen': 7039633, 'train_runtime': '3561', 'train_tokens_per_second': '1977'} +{'loss': '0.9054', 'grad_norm': '2.181', 'learning_rate': '4.329e-05', 'epoch': '0.08661', 'num_input_tokens_seen': 7041680, 'train_runtime': '3562', 'train_tokens_per_second': '1977'} +{'loss': '1.276', 'grad_norm': '2.404', 'learning_rate': '4.33e-05', 'epoch': '0.08664', 'num_input_tokens_seen': 7043727, 'train_runtime': '3563', 'train_tokens_per_second': '1977'} +{'loss': '1.626', 'grad_norm': '3.031', 'learning_rate': '4.332e-05', 'epoch': '0.08666', 'num_input_tokens_seen': 7045774, 'train_runtime': '3564', 'train_tokens_per_second': '1977'} +{'loss': '0.6274', 'grad_norm': '1.755', 'learning_rate': '4.333e-05', 'epoch': '0.08669', 'num_input_tokens_seen': 7047821, 'train_runtime': '3565', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '2.587', 'learning_rate': '4.334e-05', 'epoch': '0.08671', 'num_input_tokens_seen': 7049868, 'train_runtime': '3566', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '2.269', 'learning_rate': '4.335e-05', 'epoch': '0.08674', 'num_input_tokens_seen': 7051915, 'train_runtime': '3567', 'train_tokens_per_second': '1977'} +{'loss': '0.5705', 'grad_norm': '1.722', 'learning_rate': '4.337e-05', 'epoch': '0.08676', 'num_input_tokens_seen': 7053962, 'train_runtime': '3568', 'train_tokens_per_second': '1977'} +{'loss': '1.759', 'grad_norm': '2.759', 'learning_rate': '4.338e-05', 'epoch': '0.08679', 'num_input_tokens_seen': 7056009, 'train_runtime': '3569', 'train_tokens_per_second': '1977'} +{'loss': '0.6116', 'grad_norm': '1.843', 'learning_rate': '4.339e-05', 'epoch': '0.08681', 'num_input_tokens_seen': 7058056, 'train_runtime': '3570', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '2.157', 'learning_rate': '4.34e-05', 'epoch': '0.08684', 'num_input_tokens_seen': 7060103, 'train_runtime': '3571', 'train_tokens_per_second': '1977'} +{'loss': '0.4117', 'grad_norm': '1.753', 'learning_rate': '4.342e-05', 'epoch': '0.08686', 'num_input_tokens_seen': 7062150, 'train_runtime': '3572', 'train_tokens_per_second': '1977'} +{'loss': '1.534', 'grad_norm': '2.63', 'learning_rate': '4.343e-05', 'epoch': '0.08689', 'num_input_tokens_seen': 7064197, 'train_runtime': '3573', 'train_tokens_per_second': '1977'} +{'loss': '1.423', 'grad_norm': '2.513', 'learning_rate': '4.344e-05', 'epoch': '0.08691', 'num_input_tokens_seen': 7066244, 'train_runtime': '3574', 'train_tokens_per_second': '1977'} +{'loss': '0.4405', 'grad_norm': '1.519', 'learning_rate': '4.345e-05', 'epoch': '0.08694', 'num_input_tokens_seen': 7068291, 'train_runtime': '3575', 'train_tokens_per_second': '1977'} +{'loss': '0.7317', 'grad_norm': '1.492', 'learning_rate': '4.347e-05', 'epoch': '0.08697', 'num_input_tokens_seen': 7070338, 'train_runtime': '3576', 'train_tokens_per_second': '1977'} +{'loss': '0.4252', 'grad_norm': '1.638', 'learning_rate': '4.348e-05', 'epoch': '0.08699', 'num_input_tokens_seen': 7072385, 'train_runtime': '3577', 'train_tokens_per_second': '1977'} +{'loss': '0.9839', 'grad_norm': '2.134', 'learning_rate': '4.349e-05', 'epoch': '0.08702', 'num_input_tokens_seen': 7074432, 'train_runtime': '3578', 'train_tokens_per_second': '1977'} +{'loss': '0.5676', 'grad_norm': '2.041', 'learning_rate': '4.35e-05', 'epoch': '0.08704', 'num_input_tokens_seen': 7076479, 'train_runtime': '3579', 'train_tokens_per_second': '1977'} +{'loss': '0.87', 'grad_norm': '2.223', 'learning_rate': '4.352e-05', 'epoch': '0.08707', 'num_input_tokens_seen': 7078526, 'train_runtime': '3580', 'train_tokens_per_second': '1977'} +{'loss': '0.4736', 'grad_norm': '1.994', 'learning_rate': '4.353e-05', 'epoch': '0.08709', 'num_input_tokens_seen': 7080573, 'train_runtime': '3581', 'train_tokens_per_second': '1977'} +{'loss': '2.59', 'grad_norm': '3.813', 'learning_rate': '4.354e-05', 'epoch': '0.08712', 'num_input_tokens_seen': 7082620, 'train_runtime': '3582', 'train_tokens_per_second': '1977'} +{'loss': '0.7288', 'grad_norm': '1.508', 'learning_rate': '4.355e-05', 'epoch': '0.08714', 'num_input_tokens_seen': 7084667, 'train_runtime': '3583', 'train_tokens_per_second': '1977'} +{'loss': '0.8904', 'grad_norm': '2.194', 'learning_rate': '4.357e-05', 'epoch': '0.08717', 'num_input_tokens_seen': 7086714, 'train_runtime': '3584', 'train_tokens_per_second': '1977'} +{'loss': '0.5312', 'grad_norm': '2.206', 'learning_rate': '4.358e-05', 'epoch': '0.08719', 'num_input_tokens_seen': 7088761, 'train_runtime': '3585', 'train_tokens_per_second': '1977'} +{'loss': '0.4368', 'grad_norm': '1.837', 'learning_rate': '4.359e-05', 'epoch': '0.08722', 'num_input_tokens_seen': 7090808, 'train_runtime': '3586', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '2.658', 'learning_rate': '4.361e-05', 'epoch': '0.08724', 'num_input_tokens_seen': 7092855, 'train_runtime': '3587', 'train_tokens_per_second': '1977'} +{'loss': '0.7487', 'grad_norm': '2.078', 'learning_rate': '4.362e-05', 'epoch': '0.08727', 'num_input_tokens_seen': 7094902, 'train_runtime': '3588', 'train_tokens_per_second': '1977'} +{'loss': '0.8172', 'grad_norm': '2.589', 'learning_rate': '4.363e-05', 'epoch': '0.08729', 'num_input_tokens_seen': 7096949, 'train_runtime': '3590', 'train_tokens_per_second': '1977'} +{'loss': '1.102', 'grad_norm': '2.493', 'learning_rate': '4.364e-05', 'epoch': '0.08732', 'num_input_tokens_seen': 7098996, 'train_runtime': '3591', 'train_tokens_per_second': '1977'} +{'loss': '1.234', 'grad_norm': '2.687', 'learning_rate': '4.366e-05', 'epoch': '0.08734', 'num_input_tokens_seen': 7101043, 'train_runtime': '3592', 'train_tokens_per_second': '1977'} +{'loss': '0.9604', 'grad_norm': '2.342', 'learning_rate': '4.367e-05', 'epoch': '0.08737', 'num_input_tokens_seen': 7103090, 'train_runtime': '3593', 'train_tokens_per_second': '1977'} +{'loss': '0.3907', 'grad_norm': '1.62', 'learning_rate': '4.368e-05', 'epoch': '0.08739', 'num_input_tokens_seen': 7105137, 'train_runtime': '3594', 'train_tokens_per_second': '1977'} +{'loss': '1.218', 'grad_norm': '2.905', 'learning_rate': '4.369e-05', 'epoch': '0.08742', 'num_input_tokens_seen': 7107184, 'train_runtime': '3595', 'train_tokens_per_second': '1977'} +{'loss': '0.8237', 'grad_norm': '2.12', 'learning_rate': '4.371e-05', 'epoch': '0.08744', 'num_input_tokens_seen': 7109231, 'train_runtime': '3596', 'train_tokens_per_second': '1977'} +{'loss': '1.181', 'grad_norm': '2.408', 'learning_rate': '4.372e-05', 'epoch': '0.08747', 'num_input_tokens_seen': 7111278, 'train_runtime': '3597', 'train_tokens_per_second': '1977'} +{'loss': '1.826', 'grad_norm': '3.634', 'learning_rate': '4.373e-05', 'epoch': '0.08749', 'num_input_tokens_seen': 7113325, 'train_runtime': '3598', 'train_tokens_per_second': '1977'} +{'loss': '0.4233', 'grad_norm': '1.516', 'learning_rate': '4.374e-05', 'epoch': '0.08752', 'num_input_tokens_seen': 7115372, 'train_runtime': '3599', 'train_tokens_per_second': '1977'} +{'loss': '0.5175', 'grad_norm': '1.803', 'learning_rate': '4.376e-05', 'epoch': '0.08754', 'num_input_tokens_seen': 7117419, 'train_runtime': '3600', 'train_tokens_per_second': '1977'} +{'loss': '0.526', 'grad_norm': '1.551', 'learning_rate': '4.377e-05', 'epoch': '0.08757', 'num_input_tokens_seen': 7119466, 'train_runtime': '3601', 'train_tokens_per_second': '1977'} +{'loss': '1.071', 'grad_norm': '2.273', 'learning_rate': '4.378e-05', 'epoch': '0.08759', 'num_input_tokens_seen': 7121513, 'train_runtime': '3602', 'train_tokens_per_second': '1977'} +{'loss': '0.3919', 'grad_norm': '1.401', 'learning_rate': '4.379e-05', 'epoch': '0.08762', 'num_input_tokens_seen': 7123560, 'train_runtime': '3603', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '2.193', 'learning_rate': '4.381e-05', 'epoch': '0.08765', 'num_input_tokens_seen': 7125607, 'train_runtime': '3604', 'train_tokens_per_second': '1977'} +{'loss': '1.546', 'grad_norm': '2.396', 'learning_rate': '4.382e-05', 'epoch': '0.08767', 'num_input_tokens_seen': 7127654, 'train_runtime': '3605', 'train_tokens_per_second': '1977'} +{'loss': '0.799', 'grad_norm': '1.931', 'learning_rate': '4.383e-05', 'epoch': '0.0877', 'num_input_tokens_seen': 7129701, 'train_runtime': '3606', 'train_tokens_per_second': '1977'} +{'loss': '0.3118', 'grad_norm': '1.579', 'learning_rate': '4.384e-05', 'epoch': '0.08772', 'num_input_tokens_seen': 7131748, 'train_runtime': '3607', 'train_tokens_per_second': '1977'} +{'loss': '0.6152', 'grad_norm': '1.431', 'learning_rate': '4.386e-05', 'epoch': '0.08775', 'num_input_tokens_seen': 7133795, 'train_runtime': '3608', 'train_tokens_per_second': '1977'} +{'loss': '0.8018', 'grad_norm': '2.094', 'learning_rate': '4.387e-05', 'epoch': '0.08777', 'num_input_tokens_seen': 7135842, 'train_runtime': '3609', 'train_tokens_per_second': '1977'} +{'loss': '0.4582', 'grad_norm': '2.055', 'learning_rate': '4.388e-05', 'epoch': '0.0878', 'num_input_tokens_seen': 7137889, 'train_runtime': '3610', 'train_tokens_per_second': '1977'} +{'loss': '0.7819', 'grad_norm': '1.733', 'learning_rate': '4.389e-05', 'epoch': '0.08782', 'num_input_tokens_seen': 7139936, 'train_runtime': '3611', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.613', 'learning_rate': '4.391e-05', 'epoch': '0.08785', 'num_input_tokens_seen': 7141983, 'train_runtime': '3612', 'train_tokens_per_second': '1977'} +{'loss': '0.5646', 'grad_norm': '1.669', 'learning_rate': '4.392e-05', 'epoch': '0.08787', 'num_input_tokens_seen': 7144030, 'train_runtime': '3613', 'train_tokens_per_second': '1977'} +{'loss': '0.8688', 'grad_norm': '2.068', 'learning_rate': '4.393e-05', 'epoch': '0.0879', 'num_input_tokens_seen': 7146077, 'train_runtime': '3614', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '2.632', 'learning_rate': '4.395e-05', 'epoch': '0.08792', 'num_input_tokens_seen': 7148124, 'train_runtime': '3615', 'train_tokens_per_second': '1977'} +{'loss': '0.9537', 'grad_norm': '1.903', 'learning_rate': '4.396e-05', 'epoch': '0.08795', 'num_input_tokens_seen': 7150171, 'train_runtime': '3616', 'train_tokens_per_second': '1977'} +{'loss': '0.4018', 'grad_norm': '1.58', 'learning_rate': '4.397e-05', 'epoch': '0.08797', 'num_input_tokens_seen': 7152218, 'train_runtime': '3617', 'train_tokens_per_second': '1977'} +{'loss': '0.7732', 'grad_norm': '2.078', 'learning_rate': '4.398e-05', 'epoch': '0.088', 'num_input_tokens_seen': 7154265, 'train_runtime': '3618', 'train_tokens_per_second': '1977'} +{'loss': '0.404', 'grad_norm': '1.482', 'learning_rate': '4.4e-05', 'epoch': '0.08802', 'num_input_tokens_seen': 7156312, 'train_runtime': '3620', 'train_tokens_per_second': '1977'} +{'loss': '1.861', 'grad_norm': '2.987', 'learning_rate': '4.401e-05', 'epoch': '0.08805', 'num_input_tokens_seen': 7158359, 'train_runtime': '3621', 'train_tokens_per_second': '1977'} +{'loss': '0.8305', 'grad_norm': '1.862', 'learning_rate': '4.402e-05', 'epoch': '0.08807', 'num_input_tokens_seen': 7160406, 'train_runtime': '3622', 'train_tokens_per_second': '1977'} +{'loss': '1.677', 'grad_norm': '2.404', 'learning_rate': '4.403e-05', 'epoch': '0.0881', 'num_input_tokens_seen': 7162453, 'train_runtime': '3623', 'train_tokens_per_second': '1977'} +{'loss': '2.284', 'grad_norm': '2.879', 'learning_rate': '4.405e-05', 'epoch': '0.08812', 'num_input_tokens_seen': 7164500, 'train_runtime': '3624', 'train_tokens_per_second': '1977'} +{'loss': '1.683', 'grad_norm': '2.782', 'learning_rate': '4.406e-05', 'epoch': '0.08815', 'num_input_tokens_seen': 7166547, 'train_runtime': '3625', 'train_tokens_per_second': '1977'} +{'loss': '0.3231', 'grad_norm': '1.09', 'learning_rate': '4.407e-05', 'epoch': '0.08817', 'num_input_tokens_seen': 7168594, 'train_runtime': '3626', 'train_tokens_per_second': '1977'} +{'loss': '2.156', 'grad_norm': '3.029', 'learning_rate': '4.408e-05', 'epoch': '0.0882', 'num_input_tokens_seen': 7170641, 'train_runtime': '3627', 'train_tokens_per_second': '1977'} +{'loss': '0.5449', 'grad_norm': '1.841', 'learning_rate': '4.41e-05', 'epoch': '0.08822', 'num_input_tokens_seen': 7172688, 'train_runtime': '3628', 'train_tokens_per_second': '1977'} +{'loss': '1.959', 'grad_norm': '2.295', 'learning_rate': '4.411e-05', 'epoch': '0.08825', 'num_input_tokens_seen': 7174735, 'train_runtime': '3629', 'train_tokens_per_second': '1977'} +{'loss': '1.294', 'grad_norm': '3.161', 'learning_rate': '4.412e-05', 'epoch': '0.08827', 'num_input_tokens_seen': 7176782, 'train_runtime': '3630', 'train_tokens_per_second': '1977'} +{'loss': '1.138', 'grad_norm': '2.058', 'learning_rate': '4.413e-05', 'epoch': '0.0883', 'num_input_tokens_seen': 7178829, 'train_runtime': '3631', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.297', 'learning_rate': '4.415e-05', 'epoch': '0.08832', 'num_input_tokens_seen': 7180876, 'train_runtime': '3632', 'train_tokens_per_second': '1977'} +{'loss': '2.562', 'grad_norm': '3.46', 'learning_rate': '4.416e-05', 'epoch': '0.08835', 'num_input_tokens_seen': 7182923, 'train_runtime': '3633', 'train_tokens_per_second': '1977'} +{'loss': '2.375', 'grad_norm': '2.784', 'learning_rate': '4.417e-05', 'epoch': '0.08838', 'num_input_tokens_seen': 7184970, 'train_runtime': '3634', 'train_tokens_per_second': '1977'} +{'loss': '0.5481', 'grad_norm': '2.219', 'learning_rate': '4.418e-05', 'epoch': '0.0884', 'num_input_tokens_seen': 7187017, 'train_runtime': '3635', 'train_tokens_per_second': '1977'} +{'loss': '0.7176', 'grad_norm': '1.919', 'learning_rate': '4.42e-05', 'epoch': '0.08843', 'num_input_tokens_seen': 7189064, 'train_runtime': '3636', 'train_tokens_per_second': '1977'} +{'loss': '0.4177', 'grad_norm': '1.698', 'learning_rate': '4.421e-05', 'epoch': '0.08845', 'num_input_tokens_seen': 7191111, 'train_runtime': '3637', 'train_tokens_per_second': '1977'} +{'loss': '0.9861', 'grad_norm': '1.953', 'learning_rate': '4.422e-05', 'epoch': '0.08848', 'num_input_tokens_seen': 7193158, 'train_runtime': '3638', 'train_tokens_per_second': '1977'} +{'loss': '3.244', 'grad_norm': '3.575', 'learning_rate': '4.423e-05', 'epoch': '0.0885', 'num_input_tokens_seen': 7195205, 'train_runtime': '3639', 'train_tokens_per_second': '1977'} +{'loss': '0.7182', 'grad_norm': '2.104', 'learning_rate': '4.425e-05', 'epoch': '0.08853', 'num_input_tokens_seen': 7197252, 'train_runtime': '3640', 'train_tokens_per_second': '1977'} +{'loss': '0.7076', 'grad_norm': '1.521', 'learning_rate': '4.426e-05', 'epoch': '0.08855', 'num_input_tokens_seen': 7199299, 'train_runtime': '3641', 'train_tokens_per_second': '1977'} +{'loss': '1.233', 'grad_norm': '2.59', 'learning_rate': '4.427e-05', 'epoch': '0.08858', 'num_input_tokens_seen': 7201346, 'train_runtime': '3642', 'train_tokens_per_second': '1977'} +{'loss': '2.449', 'grad_norm': '2.64', 'learning_rate': '4.428e-05', 'epoch': '0.0886', 'num_input_tokens_seen': 7203393, 'train_runtime': '3643', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '2.459', 'learning_rate': '4.43e-05', 'epoch': '0.08863', 'num_input_tokens_seen': 7205440, 'train_runtime': '3644', 'train_tokens_per_second': '1977'} +{'loss': '1.006', 'grad_norm': '2.245', 'learning_rate': '4.431e-05', 'epoch': '0.08865', 'num_input_tokens_seen': 7207487, 'train_runtime': '3645', 'train_tokens_per_second': '1977'} +{'loss': '1.667', 'grad_norm': '2.856', 'learning_rate': '4.432e-05', 'epoch': '0.08868', 'num_input_tokens_seen': 7209534, 'train_runtime': '3646', 'train_tokens_per_second': '1977'} +{'loss': '2.834', 'grad_norm': '3.39', 'learning_rate': '4.434e-05', 'epoch': '0.0887', 'num_input_tokens_seen': 7211581, 'train_runtime': '3647', 'train_tokens_per_second': '1977'} +{'loss': '0.73', 'grad_norm': '2', 'learning_rate': '4.435e-05', 'epoch': '0.08873', 'num_input_tokens_seen': 7213628, 'train_runtime': '3648', 'train_tokens_per_second': '1977'} +{'loss': '0.7614', 'grad_norm': '2.397', 'learning_rate': '4.436e-05', 'epoch': '0.08875', 'num_input_tokens_seen': 7215675, 'train_runtime': '3649', 'train_tokens_per_second': '1977'} +{'loss': '0.7119', 'grad_norm': '1.717', 'learning_rate': '4.437e-05', 'epoch': '0.08878', 'num_input_tokens_seen': 7217722, 'train_runtime': '3651', 'train_tokens_per_second': '1977'} +{'loss': '0.7151', 'grad_norm': '1.72', 'learning_rate': '4.439e-05', 'epoch': '0.0888', 'num_input_tokens_seen': 7219769, 'train_runtime': '3652', 'train_tokens_per_second': '1977'} +{'loss': '1.282', 'grad_norm': '2.354', 'learning_rate': '4.44e-05', 'epoch': '0.08883', 'num_input_tokens_seen': 7221816, 'train_runtime': '3653', 'train_tokens_per_second': '1977'} +{'loss': '1.784', 'grad_norm': '3.032', 'learning_rate': '4.441e-05', 'epoch': '0.08885', 'num_input_tokens_seen': 7223863, 'train_runtime': '3654', 'train_tokens_per_second': '1977'} +{'loss': '0.4683', 'grad_norm': '1.844', 'learning_rate': '4.442e-05', 'epoch': '0.08888', 'num_input_tokens_seen': 7225910, 'train_runtime': '3655', 'train_tokens_per_second': '1977'} +{'loss': '1.922', 'grad_norm': '3.202', 'learning_rate': '4.444e-05', 'epoch': '0.0889', 'num_input_tokens_seen': 7227957, 'train_runtime': '3656', 'train_tokens_per_second': '1977'} +{'loss': '1.935', 'grad_norm': '2.577', 'learning_rate': '4.445e-05', 'epoch': '0.08893', 'num_input_tokens_seen': 7230004, 'train_runtime': '3657', 'train_tokens_per_second': '1977'} +{'loss': '0.5949', 'grad_norm': '1.623', 'learning_rate': '4.446e-05', 'epoch': '0.08895', 'num_input_tokens_seen': 7232051, 'train_runtime': '3658', 'train_tokens_per_second': '1977'} +{'loss': '0.4488', 'grad_norm': '1.65', 'learning_rate': '4.447e-05', 'epoch': '0.08898', 'num_input_tokens_seen': 7234098, 'train_runtime': '3659', 'train_tokens_per_second': '1977'} +{'loss': '0.4825', 'grad_norm': '1.612', 'learning_rate': '4.449e-05', 'epoch': '0.089', 'num_input_tokens_seen': 7236145, 'train_runtime': '3660', 'train_tokens_per_second': '1977'} +{'loss': '0.8476', 'grad_norm': '2.247', 'learning_rate': '4.45e-05', 'epoch': '0.08903', 'num_input_tokens_seen': 7238192, 'train_runtime': '3661', 'train_tokens_per_second': '1977'} +{'loss': '1.104', 'grad_norm': '2.304', 'learning_rate': '4.451e-05', 'epoch': '0.08906', 'num_input_tokens_seen': 7240239, 'train_runtime': '3662', 'train_tokens_per_second': '1977'} +{'loss': '1.526', 'grad_norm': '2.799', 'learning_rate': '4.452e-05', 'epoch': '0.08908', 'num_input_tokens_seen': 7242286, 'train_runtime': '3663', 'train_tokens_per_second': '1977'} +{'loss': '0.4375', 'grad_norm': '1.526', 'learning_rate': '4.454e-05', 'epoch': '0.08911', 'num_input_tokens_seen': 7244333, 'train_runtime': '3664', 'train_tokens_per_second': '1977'} +{'loss': '0.9749', 'grad_norm': '1.828', 'learning_rate': '4.455e-05', 'epoch': '0.08913', 'num_input_tokens_seen': 7246380, 'train_runtime': '3665', 'train_tokens_per_second': '1977'} +{'loss': '0.3745', 'grad_norm': '1.159', 'learning_rate': '4.456e-05', 'epoch': '0.08916', 'num_input_tokens_seen': 7248427, 'train_runtime': '3666', 'train_tokens_per_second': '1977'} +{'loss': '0.3451', 'grad_norm': '1.367', 'learning_rate': '4.457e-05', 'epoch': '0.08918', 'num_input_tokens_seen': 7250474, 'train_runtime': '3667', 'train_tokens_per_second': '1977'} +{'loss': '0.3856', 'grad_norm': '1.389', 'learning_rate': '4.459e-05', 'epoch': '0.08921', 'num_input_tokens_seen': 7252521, 'train_runtime': '3668', 'train_tokens_per_second': '1977'} +{'loss': '0.4478', 'grad_norm': '1.485', 'learning_rate': '4.46e-05', 'epoch': '0.08923', 'num_input_tokens_seen': 7254568, 'train_runtime': '3669', 'train_tokens_per_second': '1977'} +{'loss': '0.4468', 'grad_norm': '1.469', 'learning_rate': '4.461e-05', 'epoch': '0.08926', 'num_input_tokens_seen': 7256615, 'train_runtime': '3670', 'train_tokens_per_second': '1977'} +{'loss': '1.384', 'grad_norm': '2.953', 'learning_rate': '4.462e-05', 'epoch': '0.08928', 'num_input_tokens_seen': 7258662, 'train_runtime': '3671', 'train_tokens_per_second': '1977'} +{'loss': '0.6492', 'grad_norm': '2.297', 'learning_rate': '4.464e-05', 'epoch': '0.08931', 'num_input_tokens_seen': 7260709, 'train_runtime': '3672', 'train_tokens_per_second': '1977'} +{'loss': '1.134', 'grad_norm': '2.896', 'learning_rate': '4.465e-05', 'epoch': '0.08933', 'num_input_tokens_seen': 7262756, 'train_runtime': '3673', 'train_tokens_per_second': '1977'} +{'loss': '1.439', 'grad_norm': '3.014', 'learning_rate': '4.466e-05', 'epoch': '0.08936', 'num_input_tokens_seen': 7264803, 'train_runtime': '3674', 'train_tokens_per_second': '1977'} +{'loss': '1.178', 'grad_norm': '2.516', 'learning_rate': '4.468e-05', 'epoch': '0.08938', 'num_input_tokens_seen': 7266850, 'train_runtime': '3675', 'train_tokens_per_second': '1977'} +{'loss': '0.9205', 'grad_norm': '2.073', 'learning_rate': '4.469e-05', 'epoch': '0.08941', 'num_input_tokens_seen': 7268897, 'train_runtime': '3676', 'train_tokens_per_second': '1977'} +{'loss': '0.9762', 'grad_norm': '1.861', 'learning_rate': '4.47e-05', 'epoch': '0.08943', 'num_input_tokens_seen': 7270944, 'train_runtime': '3677', 'train_tokens_per_second': '1977'} +{'loss': '0.7189', 'grad_norm': '2.516', 'learning_rate': '4.471e-05', 'epoch': '0.08946', 'num_input_tokens_seen': 7272991, 'train_runtime': '3678', 'train_tokens_per_second': '1977'} +{'loss': '0.9611', 'grad_norm': '2.327', 'learning_rate': '4.473e-05', 'epoch': '0.08948', 'num_input_tokens_seen': 7275038, 'train_runtime': '3679', 'train_tokens_per_second': '1977'} +{'loss': '0.4223', 'grad_norm': '1.478', 'learning_rate': '4.474e-05', 'epoch': '0.08951', 'num_input_tokens_seen': 7277085, 'train_runtime': '3681', 'train_tokens_per_second': '1977'} +{'loss': '0.4387', 'grad_norm': '1.39', 'learning_rate': '4.475e-05', 'epoch': '0.08953', 'num_input_tokens_seen': 7279132, 'train_runtime': '3682', 'train_tokens_per_second': '1977'} +{'loss': '0.4889', 'grad_norm': '1.539', 'learning_rate': '4.476e-05', 'epoch': '0.08956', 'num_input_tokens_seen': 7281179, 'train_runtime': '3683', 'train_tokens_per_second': '1977'} +{'loss': '0.3554', 'grad_norm': '1.508', 'learning_rate': '4.478e-05', 'epoch': '0.08958', 'num_input_tokens_seen': 7283226, 'train_runtime': '3684', 'train_tokens_per_second': '1977'} +{'loss': '0.3278', 'grad_norm': '1.461', 'learning_rate': '4.479e-05', 'epoch': '0.08961', 'num_input_tokens_seen': 7285273, 'train_runtime': '3685', 'train_tokens_per_second': '1977'} +{'loss': '0.9228', 'grad_norm': '1.66', 'learning_rate': '4.48e-05', 'epoch': '0.08963', 'num_input_tokens_seen': 7287320, 'train_runtime': '3686', 'train_tokens_per_second': '1977'} +{'loss': '0.6478', 'grad_norm': '2.078', 'learning_rate': '4.481e-05', 'epoch': '0.08966', 'num_input_tokens_seen': 7289367, 'train_runtime': '3687', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '2.327', 'learning_rate': '4.483e-05', 'epoch': '0.08968', 'num_input_tokens_seen': 7291414, 'train_runtime': '3688', 'train_tokens_per_second': '1977'} +{'loss': '0.7661', 'grad_norm': '1.603', 'learning_rate': '4.484e-05', 'epoch': '0.08971', 'num_input_tokens_seen': 7293461, 'train_runtime': '3689', 'train_tokens_per_second': '1977'} +{'loss': '0.8893', 'grad_norm': '2.135', 'learning_rate': '4.485e-05', 'epoch': '0.08973', 'num_input_tokens_seen': 7295508, 'train_runtime': '3690', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '2.738', 'learning_rate': '4.486e-05', 'epoch': '0.08976', 'num_input_tokens_seen': 7297555, 'train_runtime': '3691', 'train_tokens_per_second': '1977'} +{'loss': '1.65', 'grad_norm': '2.695', 'learning_rate': '4.488e-05', 'epoch': '0.08979', 'num_input_tokens_seen': 7299602, 'train_runtime': '3692', 'train_tokens_per_second': '1977'} +{'loss': '0.5915', 'grad_norm': '1.635', 'learning_rate': '4.489e-05', 'epoch': '0.08981', 'num_input_tokens_seen': 7301649, 'train_runtime': '3693', 'train_tokens_per_second': '1977'} +{'loss': '0.9867', 'grad_norm': '2.045', 'learning_rate': '4.49e-05', 'epoch': '0.08984', 'num_input_tokens_seen': 7303696, 'train_runtime': '3694', 'train_tokens_per_second': '1977'} +{'loss': '0.3553', 'grad_norm': '1.339', 'learning_rate': '4.491e-05', 'epoch': '0.08986', 'num_input_tokens_seen': 7305743, 'train_runtime': '3695', 'train_tokens_per_second': '1977'} +{'loss': '0.9892', 'grad_norm': '2.246', 'learning_rate': '4.493e-05', 'epoch': '0.08989', 'num_input_tokens_seen': 7307790, 'train_runtime': '3696', 'train_tokens_per_second': '1977'} +{'loss': '0.3088', 'grad_norm': '1.391', 'learning_rate': '4.494e-05', 'epoch': '0.08991', 'num_input_tokens_seen': 7309837, 'train_runtime': '3697', 'train_tokens_per_second': '1977'} +{'loss': '0.7081', 'grad_norm': '1.575', 'learning_rate': '4.495e-05', 'epoch': '0.08994', 'num_input_tokens_seen': 7311884, 'train_runtime': '3698', 'train_tokens_per_second': '1977'} +{'loss': '1.437', 'grad_norm': '2.895', 'learning_rate': '4.496e-05', 'epoch': '0.08996', 'num_input_tokens_seen': 7313931, 'train_runtime': '3699', 'train_tokens_per_second': '1977'} +{'loss': '0.5341', 'grad_norm': '1.587', 'learning_rate': '4.498e-05', 'epoch': '0.08999', 'num_input_tokens_seen': 7315978, 'train_runtime': '3700', 'train_tokens_per_second': '1977'} +{'loss': '0.8273', 'grad_norm': '2.091', 'learning_rate': '4.499e-05', 'epoch': '0.09001', 'num_input_tokens_seen': 7318025, 'train_runtime': '3701', 'train_tokens_per_second': '1977'} +{'loss': '1.027', 'grad_norm': '2.155', 'learning_rate': '4.5e-05', 'epoch': '0.09004', 'num_input_tokens_seen': 7320072, 'train_runtime': '3702', 'train_tokens_per_second': '1977'} +{'loss': '1.211', 'grad_norm': '2.531', 'learning_rate': '4.502e-05', 'epoch': '0.09006', 'num_input_tokens_seen': 7322119, 'train_runtime': '3703', 'train_tokens_per_second': '1977'} +{'loss': '0.2988', 'grad_norm': '1.786', 'learning_rate': '4.503e-05', 'epoch': '0.09009', 'num_input_tokens_seen': 7324166, 'train_runtime': '3704', 'train_tokens_per_second': '1977'} +{'loss': '0.6259', 'grad_norm': '1.858', 'learning_rate': '4.504e-05', 'epoch': '0.09011', 'num_input_tokens_seen': 7326213, 'train_runtime': '3705', 'train_tokens_per_second': '1977'} +{'loss': '0.4903', 'grad_norm': '1.738', 'learning_rate': '4.505e-05', 'epoch': '0.09014', 'num_input_tokens_seen': 7328260, 'train_runtime': '3706', 'train_tokens_per_second': '1977'} +{'loss': '0.3557', 'grad_norm': '1.618', 'learning_rate': '4.507e-05', 'epoch': '0.09016', 'num_input_tokens_seen': 7330307, 'train_runtime': '3707', 'train_tokens_per_second': '1977'} +{'loss': '1.139', 'grad_norm': '2.16', 'learning_rate': '4.508e-05', 'epoch': '0.09019', 'num_input_tokens_seen': 7332354, 'train_runtime': '3708', 'train_tokens_per_second': '1977'} +{'loss': '1.722', 'grad_norm': '3.155', 'learning_rate': '4.509e-05', 'epoch': '0.09021', 'num_input_tokens_seen': 7334401, 'train_runtime': '3709', 'train_tokens_per_second': '1977'} +{'loss': '0.8295', 'grad_norm': '2.106', 'learning_rate': '4.51e-05', 'epoch': '0.09024', 'num_input_tokens_seen': 7336448, 'train_runtime': '3711', 'train_tokens_per_second': '1977'} +{'loss': '0.5662', 'grad_norm': '1.909', 'learning_rate': '4.512e-05', 'epoch': '0.09026', 'num_input_tokens_seen': 7338495, 'train_runtime': '3712', 'train_tokens_per_second': '1977'} +{'loss': '0.5695', 'grad_norm': '1.899', 'learning_rate': '4.513e-05', 'epoch': '0.09029', 'num_input_tokens_seen': 7340542, 'train_runtime': '3713', 'train_tokens_per_second': '1977'} +{'loss': '0.978', 'grad_norm': '2.369', 'learning_rate': '4.514e-05', 'epoch': '0.09031', 'num_input_tokens_seen': 7342589, 'train_runtime': '3714', 'train_tokens_per_second': '1977'} +{'loss': '2.465', 'grad_norm': '3.345', 'learning_rate': '4.515e-05', 'epoch': '0.09034', 'num_input_tokens_seen': 7344636, 'train_runtime': '3715', 'train_tokens_per_second': '1977'} +{'loss': '1.687', 'grad_norm': '2.831', 'learning_rate': '4.517e-05', 'epoch': '0.09036', 'num_input_tokens_seen': 7346683, 'train_runtime': '3716', 'train_tokens_per_second': '1977'} +{'loss': '0.8556', 'grad_norm': '2.124', 'learning_rate': '4.518e-05', 'epoch': '0.09039', 'num_input_tokens_seen': 7348730, 'train_runtime': '3717', 'train_tokens_per_second': '1977'} +{'loss': '0.6924', 'grad_norm': '2.149', 'learning_rate': '4.519e-05', 'epoch': '0.09041', 'num_input_tokens_seen': 7350777, 'train_runtime': '3718', 'train_tokens_per_second': '1977'} +{'loss': '0.8489', 'grad_norm': '2.135', 'learning_rate': '4.52e-05', 'epoch': '0.09044', 'num_input_tokens_seen': 7352824, 'train_runtime': '3719', 'train_tokens_per_second': '1977'} +{'loss': '0.3865', 'grad_norm': '1.437', 'learning_rate': '4.522e-05', 'epoch': '0.09047', 'num_input_tokens_seen': 7354871, 'train_runtime': '3720', 'train_tokens_per_second': '1977'} +{'loss': '1.297', 'grad_norm': '2.661', 'learning_rate': '4.523e-05', 'epoch': '0.09049', 'num_input_tokens_seen': 7356918, 'train_runtime': '3721', 'train_tokens_per_second': '1977'} +{'loss': '2.014', 'grad_norm': '2.542', 'learning_rate': '4.524e-05', 'epoch': '0.09052', 'num_input_tokens_seen': 7358965, 'train_runtime': '3722', 'train_tokens_per_second': '1977'} +{'loss': '0.7318', 'grad_norm': '1.63', 'learning_rate': '4.525e-05', 'epoch': '0.09054', 'num_input_tokens_seen': 7361012, 'train_runtime': '3723', 'train_tokens_per_second': '1977'} +{'loss': '0.562', 'grad_norm': '1.755', 'learning_rate': '4.527e-05', 'epoch': '0.09057', 'num_input_tokens_seen': 7363059, 'train_runtime': '3724', 'train_tokens_per_second': '1977'} +{'loss': '0.4617', 'grad_norm': '1.432', 'learning_rate': '4.528e-05', 'epoch': '0.09059', 'num_input_tokens_seen': 7365106, 'train_runtime': '3725', 'train_tokens_per_second': '1977'} +{'loss': '1.896', 'grad_norm': '2.322', 'learning_rate': '4.529e-05', 'epoch': '0.09062', 'num_input_tokens_seen': 7367153, 'train_runtime': '3726', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.692', 'learning_rate': '4.53e-05', 'epoch': '0.09064', 'num_input_tokens_seen': 7369200, 'train_runtime': '3727', 'train_tokens_per_second': '1977'} +{'loss': '0.5118', 'grad_norm': '1.901', 'learning_rate': '4.532e-05', 'epoch': '0.09067', 'num_input_tokens_seen': 7371247, 'train_runtime': '3728', 'train_tokens_per_second': '1977'} +{'loss': '2.092', 'grad_norm': '2.792', 'learning_rate': '4.533e-05', 'epoch': '0.09069', 'num_input_tokens_seen': 7373294, 'train_runtime': '3729', 'train_tokens_per_second': '1977'} +{'loss': '1.415', 'grad_norm': '2.442', 'learning_rate': '4.534e-05', 'epoch': '0.09072', 'num_input_tokens_seen': 7375341, 'train_runtime': '3730', 'train_tokens_per_second': '1977'} +{'loss': '1.267', 'grad_norm': '2.281', 'learning_rate': '4.535e-05', 'epoch': '0.09074', 'num_input_tokens_seen': 7377388, 'train_runtime': '3731', 'train_tokens_per_second': '1977'} +{'loss': '0.9549', 'grad_norm': '2.35', 'learning_rate': '4.537e-05', 'epoch': '0.09077', 'num_input_tokens_seen': 7379435, 'train_runtime': '3732', 'train_tokens_per_second': '1977'} +{'loss': '0.6114', 'grad_norm': '1.67', 'learning_rate': '4.538e-05', 'epoch': '0.09079', 'num_input_tokens_seen': 7381482, 'train_runtime': '3733', 'train_tokens_per_second': '1977'} +{'loss': '1.404', 'grad_norm': '2.801', 'learning_rate': '4.539e-05', 'epoch': '0.09082', 'num_input_tokens_seen': 7383529, 'train_runtime': '3734', 'train_tokens_per_second': '1977'} +{'loss': '1.197', 'grad_norm': '2.746', 'learning_rate': '4.541e-05', 'epoch': '0.09084', 'num_input_tokens_seen': 7385576, 'train_runtime': '3735', 'train_tokens_per_second': '1977'} +{'loss': '0.6402', 'grad_norm': '1.959', 'learning_rate': '4.542e-05', 'epoch': '0.09087', 'num_input_tokens_seen': 7387623, 'train_runtime': '3736', 'train_tokens_per_second': '1977'} +{'loss': '2.367', 'grad_norm': '3.105', 'learning_rate': '4.543e-05', 'epoch': '0.09089', 'num_input_tokens_seen': 7389670, 'train_runtime': '3737', 'train_tokens_per_second': '1977'} +{'loss': '0.7748', 'grad_norm': '2.103', 'learning_rate': '4.544e-05', 'epoch': '0.09092', 'num_input_tokens_seen': 7391717, 'train_runtime': '3738', 'train_tokens_per_second': '1977'} +{'loss': '0.9302', 'grad_norm': '2.309', 'learning_rate': '4.546e-05', 'epoch': '0.09094', 'num_input_tokens_seen': 7393764, 'train_runtime': '3739', 'train_tokens_per_second': '1977'} +{'loss': '0.7652', 'grad_norm': '1.783', 'learning_rate': '4.547e-05', 'epoch': '0.09097', 'num_input_tokens_seen': 7395811, 'train_runtime': '3740', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '2.434', 'learning_rate': '4.548e-05', 'epoch': '0.09099', 'num_input_tokens_seen': 7397858, 'train_runtime': '3742', 'train_tokens_per_second': '1977'} +{'loss': '0.5478', 'grad_norm': '1.782', 'learning_rate': '4.549e-05', 'epoch': '0.09102', 'num_input_tokens_seen': 7399905, 'train_runtime': '3743', 'train_tokens_per_second': '1977'} +{'loss': '1.296', 'grad_norm': '2.394', 'learning_rate': '4.551e-05', 'epoch': '0.09104', 'num_input_tokens_seen': 7401952, 'train_runtime': '3744', 'train_tokens_per_second': '1977'} +{'loss': '0.8471', 'grad_norm': '2.039', 'learning_rate': '4.552e-05', 'epoch': '0.09107', 'num_input_tokens_seen': 7403999, 'train_runtime': '3745', 'train_tokens_per_second': '1977'} +{'loss': '1.066', 'grad_norm': '2.205', 'learning_rate': '4.553e-05', 'epoch': '0.09109', 'num_input_tokens_seen': 7406046, 'train_runtime': '3746', 'train_tokens_per_second': '1977'} +{'loss': '1.611', 'grad_norm': '2.507', 'learning_rate': '4.554e-05', 'epoch': '0.09112', 'num_input_tokens_seen': 7408093, 'train_runtime': '3747', 'train_tokens_per_second': '1977'} +{'loss': '1.374', 'grad_norm': '3.9', 'learning_rate': '4.556e-05', 'epoch': '0.09114', 'num_input_tokens_seen': 7410140, 'train_runtime': '3748', 'train_tokens_per_second': '1977'} +{'loss': '0.4834', 'grad_norm': '1.502', 'learning_rate': '4.557e-05', 'epoch': '0.09117', 'num_input_tokens_seen': 7412187, 'train_runtime': '3749', 'train_tokens_per_second': '1977'} +{'loss': '0.8138', 'grad_norm': '2.102', 'learning_rate': '4.558e-05', 'epoch': '0.0912', 'num_input_tokens_seen': 7414234, 'train_runtime': '3750', 'train_tokens_per_second': '1977'} +{'loss': '0.7538', 'grad_norm': '2.176', 'learning_rate': '4.559e-05', 'epoch': '0.09122', 'num_input_tokens_seen': 7416281, 'train_runtime': '3751', 'train_tokens_per_second': '1977'} +{'loss': '0.3368', 'grad_norm': '1.275', 'learning_rate': '4.561e-05', 'epoch': '0.09125', 'num_input_tokens_seen': 7418328, 'train_runtime': '3752', 'train_tokens_per_second': '1977'} +{'loss': '2.022', 'grad_norm': '2.288', 'learning_rate': '4.562e-05', 'epoch': '0.09127', 'num_input_tokens_seen': 7420375, 'train_runtime': '3753', 'train_tokens_per_second': '1977'} +{'loss': '1.042', 'grad_norm': '1.914', 'learning_rate': '4.563e-05', 'epoch': '0.0913', 'num_input_tokens_seen': 7422422, 'train_runtime': '3754', 'train_tokens_per_second': '1977'} +{'loss': '2.329', 'grad_norm': '2.897', 'learning_rate': '4.564e-05', 'epoch': '0.09132', 'num_input_tokens_seen': 7424469, 'train_runtime': '3755', 'train_tokens_per_second': '1977'} +{'loss': '1.668', 'grad_norm': '2.745', 'learning_rate': '4.566e-05', 'epoch': '0.09135', 'num_input_tokens_seen': 7426516, 'train_runtime': '3756', 'train_tokens_per_second': '1977'} +{'loss': '0.6609', 'grad_norm': '2.419', 'learning_rate': '4.567e-05', 'epoch': '0.09137', 'num_input_tokens_seen': 7428563, 'train_runtime': '3757', 'train_tokens_per_second': '1977'} +{'loss': '1.463', 'grad_norm': '2.352', 'learning_rate': '4.568e-05', 'epoch': '0.0914', 'num_input_tokens_seen': 7430610, 'train_runtime': '3758', 'train_tokens_per_second': '1977'} +{'loss': '0.4893', 'grad_norm': '1.65', 'learning_rate': '4.569e-05', 'epoch': '0.09142', 'num_input_tokens_seen': 7432657, 'train_runtime': '3759', 'train_tokens_per_second': '1977'} +{'loss': '0.8627', 'grad_norm': '2.061', 'learning_rate': '4.571e-05', 'epoch': '0.09145', 'num_input_tokens_seen': 7434704, 'train_runtime': '3760', 'train_tokens_per_second': '1977'} +{'loss': '2.741', 'grad_norm': '2.727', 'learning_rate': '4.572e-05', 'epoch': '0.09147', 'num_input_tokens_seen': 7436751, 'train_runtime': '3761', 'train_tokens_per_second': '1977'} +{'loss': '1.171', 'grad_norm': '2.198', 'learning_rate': '4.573e-05', 'epoch': '0.0915', 'num_input_tokens_seen': 7438798, 'train_runtime': '3762', 'train_tokens_per_second': '1977'} +{'loss': '0.3483', 'grad_norm': '1.696', 'learning_rate': '4.575e-05', 'epoch': '0.09152', 'num_input_tokens_seen': 7440845, 'train_runtime': '3763', 'train_tokens_per_second': '1977'} +{'loss': '1.879', 'grad_norm': '3.019', 'learning_rate': '4.576e-05', 'epoch': '0.09155', 'num_input_tokens_seen': 7442892, 'train_runtime': '3764', 'train_tokens_per_second': '1977'} +{'loss': '0.8833', 'grad_norm': '2.189', 'learning_rate': '4.577e-05', 'epoch': '0.09157', 'num_input_tokens_seen': 7444939, 'train_runtime': '3765', 'train_tokens_per_second': '1977'} +{'loss': '0.6545', 'grad_norm': '2.691', 'learning_rate': '4.578e-05', 'epoch': '0.0916', 'num_input_tokens_seen': 7446986, 'train_runtime': '3766', 'train_tokens_per_second': '1977'} +{'loss': '0.3122', 'grad_norm': '1.221', 'learning_rate': '4.58e-05', 'epoch': '0.09162', 'num_input_tokens_seen': 7449033, 'train_runtime': '3767', 'train_tokens_per_second': '1977'} +{'loss': '1.204', 'grad_norm': '2.043', 'learning_rate': '4.581e-05', 'epoch': '0.09165', 'num_input_tokens_seen': 7451080, 'train_runtime': '3768', 'train_tokens_per_second': '1977'} +{'loss': '0.7017', 'grad_norm': '2.068', 'learning_rate': '4.582e-05', 'epoch': '0.09167', 'num_input_tokens_seen': 7453127, 'train_runtime': '3769', 'train_tokens_per_second': '1977'} +{'loss': '0.8904', 'grad_norm': '2.113', 'learning_rate': '4.583e-05', 'epoch': '0.0917', 'num_input_tokens_seen': 7455174, 'train_runtime': '3770', 'train_tokens_per_second': '1977'} +{'loss': '0.7502', 'grad_norm': '2.482', 'learning_rate': '4.585e-05', 'epoch': '0.09172', 'num_input_tokens_seen': 7457221, 'train_runtime': '3772', 'train_tokens_per_second': '1977'} +{'loss': '1.158', 'grad_norm': '2.508', 'learning_rate': '4.586e-05', 'epoch': '0.09175', 'num_input_tokens_seen': 7459268, 'train_runtime': '3773', 'train_tokens_per_second': '1977'} +{'loss': '1.089', 'grad_norm': '2.036', 'learning_rate': '4.587e-05', 'epoch': '0.09177', 'num_input_tokens_seen': 7461315, 'train_runtime': '3774', 'train_tokens_per_second': '1977'} +{'loss': '1.058', 'grad_norm': '2.068', 'learning_rate': '4.588e-05', 'epoch': '0.0918', 'num_input_tokens_seen': 7463362, 'train_runtime': '3775', 'train_tokens_per_second': '1977'} +{'loss': '0.9303', 'grad_norm': '1.591', 'learning_rate': '4.59e-05', 'epoch': '0.09182', 'num_input_tokens_seen': 7465409, 'train_runtime': '3776', 'train_tokens_per_second': '1977'} +{'loss': '0.5339', 'grad_norm': '1.77', 'learning_rate': '4.591e-05', 'epoch': '0.09185', 'num_input_tokens_seen': 7467456, 'train_runtime': '3777', 'train_tokens_per_second': '1977'} +{'loss': '0.9482', 'grad_norm': '1.983', 'learning_rate': '4.592e-05', 'epoch': '0.09188', 'num_input_tokens_seen': 7469503, 'train_runtime': '3778', 'train_tokens_per_second': '1977'} +{'loss': '0.3249', 'grad_norm': '1.537', 'learning_rate': '4.593e-05', 'epoch': '0.0919', 'num_input_tokens_seen': 7471550, 'train_runtime': '3779', 'train_tokens_per_second': '1977'} +{'loss': '0.3935', 'grad_norm': '1.626', 'learning_rate': '4.595e-05', 'epoch': '0.09193', 'num_input_tokens_seen': 7473597, 'train_runtime': '3780', 'train_tokens_per_second': '1977'} +{'loss': '0.7102', 'grad_norm': '1.74', 'learning_rate': '4.596e-05', 'epoch': '0.09195', 'num_input_tokens_seen': 7475644, 'train_runtime': '3781', 'train_tokens_per_second': '1977'} +{'loss': '1.317', 'grad_norm': '4.002', 'learning_rate': '4.597e-05', 'epoch': '0.09198', 'num_input_tokens_seen': 7477691, 'train_runtime': '3782', 'train_tokens_per_second': '1977'} +{'loss': '0.6385', 'grad_norm': '1.476', 'learning_rate': '4.598e-05', 'epoch': '0.092', 'num_input_tokens_seen': 7479738, 'train_runtime': '3783', 'train_tokens_per_second': '1977'} +{'loss': '0.8885', 'grad_norm': '2.329', 'learning_rate': '4.6e-05', 'epoch': '0.09203', 'num_input_tokens_seen': 7481785, 'train_runtime': '3784', 'train_tokens_per_second': '1977'} +{'loss': '0.6388', 'grad_norm': '1.6', 'learning_rate': '4.601e-05', 'epoch': '0.09205', 'num_input_tokens_seen': 7483832, 'train_runtime': '3785', 'train_tokens_per_second': '1977'} +{'loss': '0.8619', 'grad_norm': '1.616', 'learning_rate': '4.602e-05', 'epoch': '0.09208', 'num_input_tokens_seen': 7485879, 'train_runtime': '3786', 'train_tokens_per_second': '1977'} +{'loss': '0.9995', 'grad_norm': '1.941', 'learning_rate': '4.603e-05', 'epoch': '0.0921', 'num_input_tokens_seen': 7487926, 'train_runtime': '3787', 'train_tokens_per_second': '1977'} +{'loss': '0.5067', 'grad_norm': '1.497', 'learning_rate': '4.605e-05', 'epoch': '0.09213', 'num_input_tokens_seen': 7489973, 'train_runtime': '3788', 'train_tokens_per_second': '1977'} +{'loss': '0.907', 'grad_norm': '2.112', 'learning_rate': '4.606e-05', 'epoch': '0.09215', 'num_input_tokens_seen': 7492020, 'train_runtime': '3789', 'train_tokens_per_second': '1977'} +{'loss': '0.7157', 'grad_norm': '1.987', 'learning_rate': '4.607e-05', 'epoch': '0.09218', 'num_input_tokens_seen': 7494067, 'train_runtime': '3790', 'train_tokens_per_second': '1977'} +{'loss': '1.373', 'grad_norm': '2.856', 'learning_rate': '4.609e-05', 'epoch': '0.0922', 'num_input_tokens_seen': 7496114, 'train_runtime': '3791', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '2.34', 'learning_rate': '4.61e-05', 'epoch': '0.09223', 'num_input_tokens_seen': 7498161, 'train_runtime': '3792', 'train_tokens_per_second': '1977'} +{'loss': '0.5538', 'grad_norm': '1.423', 'learning_rate': '4.611e-05', 'epoch': '0.09225', 'num_input_tokens_seen': 7500208, 'train_runtime': '3793', 'train_tokens_per_second': '1977'} +{'loss': '0.9639', 'grad_norm': '2.138', 'learning_rate': '4.612e-05', 'epoch': '0.09228', 'num_input_tokens_seen': 7502255, 'train_runtime': '3794', 'train_tokens_per_second': '1977'} +{'loss': '1.179', 'grad_norm': '2.15', 'learning_rate': '4.614e-05', 'epoch': '0.0923', 'num_input_tokens_seen': 7504302, 'train_runtime': '3795', 'train_tokens_per_second': '1977'} +{'loss': '0.7321', 'grad_norm': '1.909', 'learning_rate': '4.615e-05', 'epoch': '0.09233', 'num_input_tokens_seen': 7506349, 'train_runtime': '3796', 'train_tokens_per_second': '1977'} +{'loss': '0.9928', 'grad_norm': '2.391', 'learning_rate': '4.616e-05', 'epoch': '0.09235', 'num_input_tokens_seen': 7508396, 'train_runtime': '3797', 'train_tokens_per_second': '1977'} +{'loss': '0.3585', 'grad_norm': '1.837', 'learning_rate': '4.617e-05', 'epoch': '0.09238', 'num_input_tokens_seen': 7510443, 'train_runtime': '3798', 'train_tokens_per_second': '1977'} +{'loss': '0.47', 'grad_norm': '1.369', 'learning_rate': '4.619e-05', 'epoch': '0.0924', 'num_input_tokens_seen': 7512490, 'train_runtime': '3799', 'train_tokens_per_second': '1977'} +{'loss': '0.989', 'grad_norm': '2.109', 'learning_rate': '4.62e-05', 'epoch': '0.09243', 'num_input_tokens_seen': 7514537, 'train_runtime': '3801', 'train_tokens_per_second': '1977'} +{'loss': '0.3693', 'grad_norm': '1.17', 'learning_rate': '4.621e-05', 'epoch': '0.09245', 'num_input_tokens_seen': 7516584, 'train_runtime': '3802', 'train_tokens_per_second': '1977'} +{'loss': '2.46', 'grad_norm': '5.342', 'learning_rate': '4.622e-05', 'epoch': '0.09248', 'num_input_tokens_seen': 7518631, 'train_runtime': '3803', 'train_tokens_per_second': '1977'} +{'loss': '0.8106', 'grad_norm': '1.721', 'learning_rate': '4.624e-05', 'epoch': '0.0925', 'num_input_tokens_seen': 7520678, 'train_runtime': '3804', 'train_tokens_per_second': '1977'} +{'loss': '1.382', 'grad_norm': '3.024', 'learning_rate': '4.625e-05', 'epoch': '0.09253', 'num_input_tokens_seen': 7522725, 'train_runtime': '3805', 'train_tokens_per_second': '1977'} +{'loss': '0.4769', 'grad_norm': '1.356', 'learning_rate': '4.626e-05', 'epoch': '0.09255', 'num_input_tokens_seen': 7524772, 'train_runtime': '3806', 'train_tokens_per_second': '1977'} +{'loss': '1.35', 'grad_norm': '2.232', 'learning_rate': '4.627e-05', 'epoch': '0.09258', 'num_input_tokens_seen': 7526819, 'train_runtime': '3807', 'train_tokens_per_second': '1977'} +{'loss': '2.268', 'grad_norm': '2.739', 'learning_rate': '4.629e-05', 'epoch': '0.09261', 'num_input_tokens_seen': 7528866, 'train_runtime': '3808', 'train_tokens_per_second': '1977'} +{'loss': '0.4609', 'grad_norm': '1.616', 'learning_rate': '4.63e-05', 'epoch': '0.09263', 'num_input_tokens_seen': 7530913, 'train_runtime': '3809', 'train_tokens_per_second': '1977'} +{'loss': '1.848', 'grad_norm': '2.965', 'learning_rate': '4.631e-05', 'epoch': '0.09266', 'num_input_tokens_seen': 7532960, 'train_runtime': '3810', 'train_tokens_per_second': '1977'} +{'loss': '0.5825', 'grad_norm': '1.57', 'learning_rate': '4.632e-05', 'epoch': '0.09268', 'num_input_tokens_seen': 7535007, 'train_runtime': '3811', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '1.906', 'learning_rate': '4.634e-05', 'epoch': '0.09271', 'num_input_tokens_seen': 7537054, 'train_runtime': '3812', 'train_tokens_per_second': '1977'} +{'loss': '0.6645', 'grad_norm': '2.011', 'learning_rate': '4.635e-05', 'epoch': '0.09273', 'num_input_tokens_seen': 7539101, 'train_runtime': '3813', 'train_tokens_per_second': '1977'} +{'loss': '1.061', 'grad_norm': '2.078', 'learning_rate': '4.636e-05', 'epoch': '0.09276', 'num_input_tokens_seen': 7541148, 'train_runtime': '3814', 'train_tokens_per_second': '1977'} +{'loss': '0.7876', 'grad_norm': '1.752', 'learning_rate': '4.637e-05', 'epoch': '0.09278', 'num_input_tokens_seen': 7543195, 'train_runtime': '3815', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '2.191', 'learning_rate': '4.639e-05', 'epoch': '0.09281', 'num_input_tokens_seen': 7545242, 'train_runtime': '3816', 'train_tokens_per_second': '1977'} +{'loss': '0.828', 'grad_norm': '2.126', 'learning_rate': '4.64e-05', 'epoch': '0.09283', 'num_input_tokens_seen': 7547289, 'train_runtime': '3817', 'train_tokens_per_second': '1977'} +{'loss': '1.363', 'grad_norm': '2.225', 'learning_rate': '4.641e-05', 'epoch': '0.09286', 'num_input_tokens_seen': 7549336, 'train_runtime': '3818', 'train_tokens_per_second': '1977'} +{'loss': '0.9553', 'grad_norm': '2.265', 'learning_rate': '4.642e-05', 'epoch': '0.09288', 'num_input_tokens_seen': 7551383, 'train_runtime': '3819', 'train_tokens_per_second': '1977'} +{'loss': '1.93', 'grad_norm': '3.089', 'learning_rate': '4.644e-05', 'epoch': '0.09291', 'num_input_tokens_seen': 7553430, 'train_runtime': '3820', 'train_tokens_per_second': '1977'} +{'loss': '0.9159', 'grad_norm': '1.298', 'learning_rate': '4.645e-05', 'epoch': '0.09293', 'num_input_tokens_seen': 7555477, 'train_runtime': '3821', 'train_tokens_per_second': '1977'} +{'loss': '0.4006', 'grad_norm': '1.41', 'learning_rate': '4.646e-05', 'epoch': '0.09296', 'num_input_tokens_seen': 7557524, 'train_runtime': '3822', 'train_tokens_per_second': '1977'} +{'loss': '2.205', 'grad_norm': '3.379', 'learning_rate': '4.648e-05', 'epoch': '0.09298', 'num_input_tokens_seen': 7559571, 'train_runtime': '3823', 'train_tokens_per_second': '1977'} +{'loss': '0.3728', 'grad_norm': '1.814', 'learning_rate': '4.649e-05', 'epoch': '0.09301', 'num_input_tokens_seen': 7561618, 'train_runtime': '3824', 'train_tokens_per_second': '1977'} +{'loss': '0.8203', 'grad_norm': '1.606', 'learning_rate': '4.65e-05', 'epoch': '0.09303', 'num_input_tokens_seen': 7563665, 'train_runtime': '3825', 'train_tokens_per_second': '1977'} +{'loss': '0.7299', 'grad_norm': '1.743', 'learning_rate': '4.651e-05', 'epoch': '0.09306', 'num_input_tokens_seen': 7565712, 'train_runtime': '3826', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '2.465', 'learning_rate': '4.653e-05', 'epoch': '0.09308', 'num_input_tokens_seen': 7567759, 'train_runtime': '3827', 'train_tokens_per_second': '1977'} +{'loss': '2.257', 'grad_norm': '2.886', 'learning_rate': '4.654e-05', 'epoch': '0.09311', 'num_input_tokens_seen': 7569806, 'train_runtime': '3828', 'train_tokens_per_second': '1977'} +{'loss': '1.823', 'grad_norm': '2.941', 'learning_rate': '4.655e-05', 'epoch': '0.09313', 'num_input_tokens_seen': 7571853, 'train_runtime': '3829', 'train_tokens_per_second': '1977'} +{'loss': '0.787', 'grad_norm': '1.921', 'learning_rate': '4.656e-05', 'epoch': '0.09316', 'num_input_tokens_seen': 7573900, 'train_runtime': '3831', 'train_tokens_per_second': '1977'} +{'loss': '0.6447', 'grad_norm': '1.266', 'learning_rate': '4.658e-05', 'epoch': '0.09318', 'num_input_tokens_seen': 7575947, 'train_runtime': '3832', 'train_tokens_per_second': '1977'} +{'loss': '0.5104', 'grad_norm': '1.83', 'learning_rate': '4.659e-05', 'epoch': '0.09321', 'num_input_tokens_seen': 7577994, 'train_runtime': '3833', 'train_tokens_per_second': '1977'} +{'loss': '0.3944', 'grad_norm': '1.791', 'learning_rate': '4.66e-05', 'epoch': '0.09323', 'num_input_tokens_seen': 7580041, 'train_runtime': '3834', 'train_tokens_per_second': '1977'} +{'loss': '0.6957', 'grad_norm': '1.78', 'learning_rate': '4.661e-05', 'epoch': '0.09326', 'num_input_tokens_seen': 7582088, 'train_runtime': '3835', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '2.564', 'learning_rate': '4.663e-05', 'epoch': '0.09328', 'num_input_tokens_seen': 7584135, 'train_runtime': '3836', 'train_tokens_per_second': '1977'} +{'loss': '0.3446', 'grad_norm': '1.471', 'learning_rate': '4.664e-05', 'epoch': '0.09331', 'num_input_tokens_seen': 7586182, 'train_runtime': '3837', 'train_tokens_per_second': '1977'} +{'loss': '0.8854', 'grad_norm': '2.057', 'learning_rate': '4.665e-05', 'epoch': '0.09334', 'num_input_tokens_seen': 7588229, 'train_runtime': '3838', 'train_tokens_per_second': '1977'} +{'loss': '0.3671', 'grad_norm': '1.562', 'learning_rate': '4.666e-05', 'epoch': '0.09336', 'num_input_tokens_seen': 7590276, 'train_runtime': '3839', 'train_tokens_per_second': '1977'} +{'loss': '1.52', 'grad_norm': '3.049', 'learning_rate': '4.668e-05', 'epoch': '0.09339', 'num_input_tokens_seen': 7592323, 'train_runtime': '3840', 'train_tokens_per_second': '1977'} +{'loss': '0.6972', 'grad_norm': '2.095', 'learning_rate': '4.669e-05', 'epoch': '0.09341', 'num_input_tokens_seen': 7594370, 'train_runtime': '3841', 'train_tokens_per_second': '1977'} +{'loss': '1.494', 'grad_norm': '2.802', 'learning_rate': '4.67e-05', 'epoch': '0.09344', 'num_input_tokens_seen': 7596417, 'train_runtime': '3842', 'train_tokens_per_second': '1977'} +{'loss': '0.9628', 'grad_norm': '2.324', 'learning_rate': '4.671e-05', 'epoch': '0.09346', 'num_input_tokens_seen': 7598464, 'train_runtime': '3843', 'train_tokens_per_second': '1977'} +{'loss': '0.8532', 'grad_norm': '2.178', 'learning_rate': '4.673e-05', 'epoch': '0.09349', 'num_input_tokens_seen': 7600511, 'train_runtime': '3844', 'train_tokens_per_second': '1977'} +{'loss': '0.9145', 'grad_norm': '1.59', 'learning_rate': '4.674e-05', 'epoch': '0.09351', 'num_input_tokens_seen': 7602558, 'train_runtime': '3845', 'train_tokens_per_second': '1977'} +{'loss': '1.703', 'grad_norm': '2.581', 'learning_rate': '4.675e-05', 'epoch': '0.09354', 'num_input_tokens_seen': 7604605, 'train_runtime': '3846', 'train_tokens_per_second': '1977'} +{'loss': '1.247', 'grad_norm': '2.173', 'learning_rate': '4.676e-05', 'epoch': '0.09356', 'num_input_tokens_seen': 7606652, 'train_runtime': '3847', 'train_tokens_per_second': '1977'} +{'loss': '0.4583', 'grad_norm': '1.334', 'learning_rate': '4.678e-05', 'epoch': '0.09359', 'num_input_tokens_seen': 7608699, 'train_runtime': '3848', 'train_tokens_per_second': '1977'} +{'loss': '0.7386', 'grad_norm': '1.822', 'learning_rate': '4.679e-05', 'epoch': '0.09361', 'num_input_tokens_seen': 7610746, 'train_runtime': '3849', 'train_tokens_per_second': '1977'} +{'loss': '0.9288', 'grad_norm': '1.892', 'learning_rate': '4.68e-05', 'epoch': '0.09364', 'num_input_tokens_seen': 7612793, 'train_runtime': '3850', 'train_tokens_per_second': '1977'} +{'loss': '0.9934', 'grad_norm': '1.85', 'learning_rate': '4.682e-05', 'epoch': '0.09366', 'num_input_tokens_seen': 7614840, 'train_runtime': '3851', 'train_tokens_per_second': '1977'} +{'loss': '0.3426', 'grad_norm': '1.452', 'learning_rate': '4.683e-05', 'epoch': '0.09369', 'num_input_tokens_seen': 7616887, 'train_runtime': '3852', 'train_tokens_per_second': '1977'} +{'loss': '0.5021', 'grad_norm': '1.309', 'learning_rate': '4.684e-05', 'epoch': '0.09371', 'num_input_tokens_seen': 7618934, 'train_runtime': '3853', 'train_tokens_per_second': '1977'} +{'loss': '0.559', 'grad_norm': '1.673', 'learning_rate': '4.685e-05', 'epoch': '0.09374', 'num_input_tokens_seen': 7620981, 'train_runtime': '3854', 'train_tokens_per_second': '1977'} +{'loss': '1.469', 'grad_norm': '2.903', 'learning_rate': '4.687e-05', 'epoch': '0.09376', 'num_input_tokens_seen': 7623028, 'train_runtime': '3855', 'train_tokens_per_second': '1977'} +{'loss': '1.56', 'grad_norm': '2.996', 'learning_rate': '4.688e-05', 'epoch': '0.09379', 'num_input_tokens_seen': 7625075, 'train_runtime': '3856', 'train_tokens_per_second': '1977'} +{'loss': '0.9292', 'grad_norm': '2.146', 'learning_rate': '4.689e-05', 'epoch': '0.09381', 'num_input_tokens_seen': 7627122, 'train_runtime': '3857', 'train_tokens_per_second': '1977'} +{'loss': '1.286', 'grad_norm': '2.058', 'learning_rate': '4.69e-05', 'epoch': '0.09384', 'num_input_tokens_seen': 7629169, 'train_runtime': '3858', 'train_tokens_per_second': '1977'} +{'loss': '0.4274', 'grad_norm': '1.067', 'learning_rate': '4.692e-05', 'epoch': '0.09386', 'num_input_tokens_seen': 7631216, 'train_runtime': '3859', 'train_tokens_per_second': '1977'} +{'loss': '0.6643', 'grad_norm': '1.939', 'learning_rate': '4.693e-05', 'epoch': '0.09389', 'num_input_tokens_seen': 7633263, 'train_runtime': '3861', 'train_tokens_per_second': '1977'} +{'loss': '0.9646', 'grad_norm': '2.181', 'learning_rate': '4.694e-05', 'epoch': '0.09391', 'num_input_tokens_seen': 7635310, 'train_runtime': '3862', 'train_tokens_per_second': '1977'} +{'loss': '2.381', 'grad_norm': '3.055', 'learning_rate': '4.695e-05', 'epoch': '0.09394', 'num_input_tokens_seen': 7637357, 'train_runtime': '3863', 'train_tokens_per_second': '1977'} +{'loss': '1.253', 'grad_norm': '2.491', 'learning_rate': '4.697e-05', 'epoch': '0.09396', 'num_input_tokens_seen': 7639404, 'train_runtime': '3864', 'train_tokens_per_second': '1977'} +{'loss': '0.3143', 'grad_norm': '1.482', 'learning_rate': '4.698e-05', 'epoch': '0.09399', 'num_input_tokens_seen': 7641451, 'train_runtime': '3865', 'train_tokens_per_second': '1977'} +{'loss': '1.037', 'grad_norm': '2.024', 'learning_rate': '4.699e-05', 'epoch': '0.09402', 'num_input_tokens_seen': 7643498, 'train_runtime': '3866', 'train_tokens_per_second': '1977'} +{'loss': '0.4883', 'grad_norm': '1.432', 'learning_rate': '4.7e-05', 'epoch': '0.09404', 'num_input_tokens_seen': 7645545, 'train_runtime': '3867', 'train_tokens_per_second': '1977'} +{'loss': '0.6694', 'grad_norm': '1.402', 'learning_rate': '4.702e-05', 'epoch': '0.09407', 'num_input_tokens_seen': 7647592, 'train_runtime': '3868', 'train_tokens_per_second': '1977'} +{'loss': '0.63', 'grad_norm': '1.712', 'learning_rate': '4.703e-05', 'epoch': '0.09409', 'num_input_tokens_seen': 7649639, 'train_runtime': '3869', 'train_tokens_per_second': '1977'} +{'loss': '0.4047', 'grad_norm': '1.309', 'learning_rate': '4.704e-05', 'epoch': '0.09412', 'num_input_tokens_seen': 7651686, 'train_runtime': '3870', 'train_tokens_per_second': '1977'} +{'loss': '0.4623', 'grad_norm': '1.347', 'learning_rate': '4.705e-05', 'epoch': '0.09414', 'num_input_tokens_seen': 7653733, 'train_runtime': '3871', 'train_tokens_per_second': '1977'} +{'loss': '1.503', 'grad_norm': '2.777', 'learning_rate': '4.707e-05', 'epoch': '0.09417', 'num_input_tokens_seen': 7655780, 'train_runtime': '3872', 'train_tokens_per_second': '1977'} +{'loss': '0.3492', 'grad_norm': '1.351', 'learning_rate': '4.708e-05', 'epoch': '0.09419', 'num_input_tokens_seen': 7657827, 'train_runtime': '3873', 'train_tokens_per_second': '1977'} +{'loss': '0.8966', 'grad_norm': '2.371', 'learning_rate': '4.709e-05', 'epoch': '0.09422', 'num_input_tokens_seen': 7659874, 'train_runtime': '3874', 'train_tokens_per_second': '1977'} +{'loss': '0.7026', 'grad_norm': '1.552', 'learning_rate': '4.71e-05', 'epoch': '0.09424', 'num_input_tokens_seen': 7661921, 'train_runtime': '3875', 'train_tokens_per_second': '1977'} +{'loss': '1.879', 'grad_norm': '2.407', 'learning_rate': '4.712e-05', 'epoch': '0.09427', 'num_input_tokens_seen': 7663968, 'train_runtime': '3876', 'train_tokens_per_second': '1977'} +{'loss': '0.6508', 'grad_norm': '1.859', 'learning_rate': '4.713e-05', 'epoch': '0.09429', 'num_input_tokens_seen': 7666015, 'train_runtime': '3877', 'train_tokens_per_second': '1977'} +{'loss': '0.4466', 'grad_norm': '1.362', 'learning_rate': '4.714e-05', 'epoch': '0.09432', 'num_input_tokens_seen': 7668062, 'train_runtime': '3878', 'train_tokens_per_second': '1977'} +{'loss': '0.473', 'grad_norm': '1.661', 'learning_rate': '4.716e-05', 'epoch': '0.09434', 'num_input_tokens_seen': 7670109, 'train_runtime': '3879', 'train_tokens_per_second': '1977'} +{'loss': '0.4077', 'grad_norm': '1.428', 'learning_rate': '4.717e-05', 'epoch': '0.09437', 'num_input_tokens_seen': 7672156, 'train_runtime': '3880', 'train_tokens_per_second': '1977'} +{'loss': '0.4312', 'grad_norm': '1.407', 'learning_rate': '4.718e-05', 'epoch': '0.09439', 'num_input_tokens_seen': 7674203, 'train_runtime': '3881', 'train_tokens_per_second': '1977'} +{'loss': '0.7631', 'grad_norm': '2.194', 'learning_rate': '4.719e-05', 'epoch': '0.09442', 'num_input_tokens_seen': 7676250, 'train_runtime': '3882', 'train_tokens_per_second': '1977'} +{'loss': '1.349', 'grad_norm': '2.327', 'learning_rate': '4.721e-05', 'epoch': '0.09444', 'num_input_tokens_seen': 7678297, 'train_runtime': '3883', 'train_tokens_per_second': '1977'} +{'loss': '0.4253', 'grad_norm': '1.591', 'learning_rate': '4.722e-05', 'epoch': '0.09447', 'num_input_tokens_seen': 7680344, 'train_runtime': '3884', 'train_tokens_per_second': '1977'} +{'loss': '2.433', 'grad_norm': '2.603', 'learning_rate': '4.723e-05', 'epoch': '0.09449', 'num_input_tokens_seen': 7682391, 'train_runtime': '3885', 'train_tokens_per_second': '1977'} +{'loss': '1.435', 'grad_norm': '1.976', 'learning_rate': '4.724e-05', 'epoch': '0.09452', 'num_input_tokens_seen': 7684438, 'train_runtime': '3886', 'train_tokens_per_second': '1977'} +{'loss': '0.5914', 'grad_norm': '1.561', 'learning_rate': '4.726e-05', 'epoch': '0.09454', 'num_input_tokens_seen': 7686485, 'train_runtime': '3887', 'train_tokens_per_second': '1977'} +{'loss': '0.3819', 'grad_norm': '1.656', 'learning_rate': '4.727e-05', 'epoch': '0.09457', 'num_input_tokens_seen': 7688532, 'train_runtime': '3888', 'train_tokens_per_second': '1977'} +{'loss': '0.5131', 'grad_norm': '1.852', 'learning_rate': '4.728e-05', 'epoch': '0.09459', 'num_input_tokens_seen': 7690579, 'train_runtime': '3889', 'train_tokens_per_second': '1977'} +{'loss': '0.9306', 'grad_norm': '1.813', 'learning_rate': '4.729e-05', 'epoch': '0.09462', 'num_input_tokens_seen': 7692626, 'train_runtime': '3891', 'train_tokens_per_second': '1977'} +{'loss': '0.5976', 'grad_norm': '1.827', 'learning_rate': '4.731e-05', 'epoch': '0.09464', 'num_input_tokens_seen': 7694673, 'train_runtime': '3892', 'train_tokens_per_second': '1977'} +{'loss': '0.735', 'grad_norm': '2.396', 'learning_rate': '4.732e-05', 'epoch': '0.09467', 'num_input_tokens_seen': 7696720, 'train_runtime': '3893', 'train_tokens_per_second': '1977'} +{'loss': '0.6714', 'grad_norm': '1.266', 'learning_rate': '4.733e-05', 'epoch': '0.09469', 'num_input_tokens_seen': 7698767, 'train_runtime': '3894', 'train_tokens_per_second': '1977'} +{'loss': '0.8889', 'grad_norm': '1.832', 'learning_rate': '4.734e-05', 'epoch': '0.09472', 'num_input_tokens_seen': 7700814, 'train_runtime': '3895', 'train_tokens_per_second': '1977'} +{'loss': '1.586', 'grad_norm': '2.851', 'learning_rate': '4.736e-05', 'epoch': '0.09475', 'num_input_tokens_seen': 7702861, 'train_runtime': '3896', 'train_tokens_per_second': '1977'} +{'loss': '0.3622', 'grad_norm': '1.437', 'learning_rate': '4.737e-05', 'epoch': '0.09477', 'num_input_tokens_seen': 7704908, 'train_runtime': '3897', 'train_tokens_per_second': '1977'} +{'loss': '0.408', 'grad_norm': '1.273', 'learning_rate': '4.738e-05', 'epoch': '0.0948', 'num_input_tokens_seen': 7706955, 'train_runtime': '3898', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '1.928', 'learning_rate': '4.739e-05', 'epoch': '0.09482', 'num_input_tokens_seen': 7709002, 'train_runtime': '3899', 'train_tokens_per_second': '1977'} +{'loss': '0.3527', 'grad_norm': '1.476', 'learning_rate': '4.741e-05', 'epoch': '0.09485', 'num_input_tokens_seen': 7711049, 'train_runtime': '3900', 'train_tokens_per_second': '1977'} +{'loss': '0.7321', 'grad_norm': '1.702', 'learning_rate': '4.742e-05', 'epoch': '0.09487', 'num_input_tokens_seen': 7713096, 'train_runtime': '3901', 'train_tokens_per_second': '1977'} +{'loss': '1.778', 'grad_norm': '2.48', 'learning_rate': '4.743e-05', 'epoch': '0.0949', 'num_input_tokens_seen': 7715143, 'train_runtime': '3902', 'train_tokens_per_second': '1977'} +{'loss': '1.009', 'grad_norm': '2.163', 'learning_rate': '4.744e-05', 'epoch': '0.09492', 'num_input_tokens_seen': 7717190, 'train_runtime': '3903', 'train_tokens_per_second': '1977'} +{'loss': '0.9971', 'grad_norm': '2.126', 'learning_rate': '4.746e-05', 'epoch': '0.09495', 'num_input_tokens_seen': 7719237, 'train_runtime': '3904', 'train_tokens_per_second': '1977'} +{'loss': '2.224', 'grad_norm': '3.805', 'learning_rate': '4.747e-05', 'epoch': '0.09497', 'num_input_tokens_seen': 7721284, 'train_runtime': '3905', 'train_tokens_per_second': '1977'} +{'loss': '1.003', 'grad_norm': '1.98', 'learning_rate': '4.748e-05', 'epoch': '0.095', 'num_input_tokens_seen': 7723331, 'train_runtime': '3906', 'train_tokens_per_second': '1977'} +{'loss': '1.413', 'grad_norm': '2.188', 'learning_rate': '4.749e-05', 'epoch': '0.09502', 'num_input_tokens_seen': 7725378, 'train_runtime': '3907', 'train_tokens_per_second': '1977'} +{'loss': '0.3115', 'grad_norm': '1.169', 'learning_rate': '4.751e-05', 'epoch': '0.09505', 'num_input_tokens_seen': 7727425, 'train_runtime': '3908', 'train_tokens_per_second': '1977'} +{'loss': '0.7422', 'grad_norm': '1.736', 'learning_rate': '4.752e-05', 'epoch': '0.09507', 'num_input_tokens_seen': 7729472, 'train_runtime': '3909', 'train_tokens_per_second': '1977'} +{'loss': '0.6762', 'grad_norm': '1.795', 'learning_rate': '4.753e-05', 'epoch': '0.0951', 'num_input_tokens_seen': 7731519, 'train_runtime': '3910', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.009', 'learning_rate': '4.755e-05', 'epoch': '0.09512', 'num_input_tokens_seen': 7733566, 'train_runtime': '3911', 'train_tokens_per_second': '1977'} +{'loss': '0.8046', 'grad_norm': '2.729', 'learning_rate': '4.756e-05', 'epoch': '0.09515', 'num_input_tokens_seen': 7735613, 'train_runtime': '3912', 'train_tokens_per_second': '1977'} +{'loss': '1.763', 'grad_norm': '2.528', 'learning_rate': '4.757e-05', 'epoch': '0.09517', 'num_input_tokens_seen': 7737660, 'train_runtime': '3913', 'train_tokens_per_second': '1977'} +{'loss': '0.58', 'grad_norm': '1.261', 'learning_rate': '4.758e-05', 'epoch': '0.0952', 'num_input_tokens_seen': 7739707, 'train_runtime': '3914', 'train_tokens_per_second': '1977'} +{'loss': '0.9787', 'grad_norm': '2.488', 'learning_rate': '4.76e-05', 'epoch': '0.09522', 'num_input_tokens_seen': 7741754, 'train_runtime': '3915', 'train_tokens_per_second': '1977'} +{'loss': '0.2815', 'grad_norm': '1.408', 'learning_rate': '4.761e-05', 'epoch': '0.09525', 'num_input_tokens_seen': 7743801, 'train_runtime': '3916', 'train_tokens_per_second': '1977'} +{'loss': '1.186', 'grad_norm': '2.465', 'learning_rate': '4.762e-05', 'epoch': '0.09527', 'num_input_tokens_seen': 7745848, 'train_runtime': '3917', 'train_tokens_per_second': '1977'} +{'loss': '1.564', 'grad_norm': '3.025', 'learning_rate': '4.763e-05', 'epoch': '0.0953', 'num_input_tokens_seen': 7747895, 'train_runtime': '3918', 'train_tokens_per_second': '1977'} +{'loss': '0.9821', 'grad_norm': '2.064', 'learning_rate': '4.765e-05', 'epoch': '0.09532', 'num_input_tokens_seen': 7749942, 'train_runtime': '3920', 'train_tokens_per_second': '1977'} +{'loss': '0.5603', 'grad_norm': '1.67', 'learning_rate': '4.766e-05', 'epoch': '0.09535', 'num_input_tokens_seen': 7751989, 'train_runtime': '3921', 'train_tokens_per_second': '1977'} +{'loss': '0.6755', 'grad_norm': '2.118', 'learning_rate': '4.767e-05', 'epoch': '0.09537', 'num_input_tokens_seen': 7754036, 'train_runtime': '3922', 'train_tokens_per_second': '1977'} +{'loss': '0.7113', 'grad_norm': '1.769', 'learning_rate': '4.768e-05', 'epoch': '0.0954', 'num_input_tokens_seen': 7756083, 'train_runtime': '3923', 'train_tokens_per_second': '1977'} +{'loss': '0.3613', 'grad_norm': '1.785', 'learning_rate': '4.77e-05', 'epoch': '0.09543', 'num_input_tokens_seen': 7758130, 'train_runtime': '3924', 'train_tokens_per_second': '1977'} +{'loss': '0.564', 'grad_norm': '1.681', 'learning_rate': '4.771e-05', 'epoch': '0.09545', 'num_input_tokens_seen': 7760177, 'train_runtime': '3925', 'train_tokens_per_second': '1977'} +{'loss': '1.88', 'grad_norm': '2.667', 'learning_rate': '4.772e-05', 'epoch': '0.09548', 'num_input_tokens_seen': 7762224, 'train_runtime': '3926', 'train_tokens_per_second': '1977'} +{'loss': '0.8284', 'grad_norm': '1.815', 'learning_rate': '4.773e-05', 'epoch': '0.0955', 'num_input_tokens_seen': 7764271, 'train_runtime': '3927', 'train_tokens_per_second': '1977'} +{'loss': '0.6109', 'grad_norm': '2.054', 'learning_rate': '4.775e-05', 'epoch': '0.09553', 'num_input_tokens_seen': 7766318, 'train_runtime': '3928', 'train_tokens_per_second': '1977'} +{'loss': '1.308', 'grad_norm': '2.349', 'learning_rate': '4.776e-05', 'epoch': '0.09555', 'num_input_tokens_seen': 7768365, 'train_runtime': '3929', 'train_tokens_per_second': '1977'} +{'loss': '0.423', 'grad_norm': '1.572', 'learning_rate': '4.777e-05', 'epoch': '0.09558', 'num_input_tokens_seen': 7770412, 'train_runtime': '3930', 'train_tokens_per_second': '1977'} +{'loss': '2.126', 'grad_norm': '2.789', 'learning_rate': '4.778e-05', 'epoch': '0.0956', 'num_input_tokens_seen': 7772459, 'train_runtime': '3931', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '2.097', 'learning_rate': '4.78e-05', 'epoch': '0.09563', 'num_input_tokens_seen': 7774506, 'train_runtime': '3932', 'train_tokens_per_second': '1977'} +{'loss': '0.688', 'grad_norm': '1.862', 'learning_rate': '4.781e-05', 'epoch': '0.09565', 'num_input_tokens_seen': 7776553, 'train_runtime': '3933', 'train_tokens_per_second': '1977'} +{'loss': '1.715', 'grad_norm': '2.635', 'learning_rate': '4.782e-05', 'epoch': '0.09568', 'num_input_tokens_seen': 7778600, 'train_runtime': '3934', 'train_tokens_per_second': '1977'} +{'loss': '0.7708', 'grad_norm': '1.712', 'learning_rate': '4.783e-05', 'epoch': '0.0957', 'num_input_tokens_seen': 7780647, 'train_runtime': '3935', 'train_tokens_per_second': '1977'} +{'loss': '0.9012', 'grad_norm': '2.054', 'learning_rate': '4.785e-05', 'epoch': '0.09573', 'num_input_tokens_seen': 7782694, 'train_runtime': '3936', 'train_tokens_per_second': '1977'} +{'loss': '1.207', 'grad_norm': '2.065', 'learning_rate': '4.786e-05', 'epoch': '0.09575', 'num_input_tokens_seen': 7784741, 'train_runtime': '3937', 'train_tokens_per_second': '1977'} +{'loss': '0.5132', 'grad_norm': '1.544', 'learning_rate': '4.787e-05', 'epoch': '0.09578', 'num_input_tokens_seen': 7786788, 'train_runtime': '3938', 'train_tokens_per_second': '1977'} +{'loss': '2.234', 'grad_norm': '2.982', 'learning_rate': '4.789e-05', 'epoch': '0.0958', 'num_input_tokens_seen': 7788835, 'train_runtime': '3939', 'train_tokens_per_second': '1977'} +{'loss': '0.8618', 'grad_norm': '1.974', 'learning_rate': '4.79e-05', 'epoch': '0.09583', 'num_input_tokens_seen': 7790882, 'train_runtime': '3940', 'train_tokens_per_second': '1977'} +{'loss': '1.103', 'grad_norm': '2.403', 'learning_rate': '4.791e-05', 'epoch': '0.09585', 'num_input_tokens_seen': 7792929, 'train_runtime': '3941', 'train_tokens_per_second': '1977'} +{'loss': '2.281', 'grad_norm': '3.792', 'learning_rate': '4.792e-05', 'epoch': '0.09588', 'num_input_tokens_seen': 7794976, 'train_runtime': '3942', 'train_tokens_per_second': '1977'} +{'loss': '0.7343', 'grad_norm': '2.474', 'learning_rate': '4.794e-05', 'epoch': '0.0959', 'num_input_tokens_seen': 7797023, 'train_runtime': '3943', 'train_tokens_per_second': '1977'} +{'loss': '0.7179', 'grad_norm': '1.623', 'learning_rate': '4.795e-05', 'epoch': '0.09593', 'num_input_tokens_seen': 7799070, 'train_runtime': '3944', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '2.032', 'learning_rate': '4.796e-05', 'epoch': '0.09595', 'num_input_tokens_seen': 7801117, 'train_runtime': '3945', 'train_tokens_per_second': '1977'} +{'loss': '0.8918', 'grad_norm': '1.749', 'learning_rate': '4.797e-05', 'epoch': '0.09598', 'num_input_tokens_seen': 7803164, 'train_runtime': '3946', 'train_tokens_per_second': '1977'} +{'loss': '0.5391', 'grad_norm': '1.706', 'learning_rate': '4.799e-05', 'epoch': '0.096', 'num_input_tokens_seen': 7805211, 'train_runtime': '3947', 'train_tokens_per_second': '1977'} +{'loss': '0.3806', 'grad_norm': '1.461', 'learning_rate': '4.8e-05', 'epoch': '0.09603', 'num_input_tokens_seen': 7807258, 'train_runtime': '3948', 'train_tokens_per_second': '1977'} +{'loss': '0.564', 'grad_norm': '1.597', 'learning_rate': '4.801e-05', 'epoch': '0.09605', 'num_input_tokens_seen': 7809305, 'train_runtime': '3949', 'train_tokens_per_second': '1977'} +{'loss': '0.3321', 'grad_norm': '1.152', 'learning_rate': '4.802e-05', 'epoch': '0.09608', 'num_input_tokens_seen': 7811352, 'train_runtime': '3951', 'train_tokens_per_second': '1977'} +{'loss': '0.9399', 'grad_norm': '1.588', 'learning_rate': '4.804e-05', 'epoch': '0.0961', 'num_input_tokens_seen': 7813399, 'train_runtime': '3952', 'train_tokens_per_second': '1977'} +{'loss': '0.8847', 'grad_norm': '3.319', 'learning_rate': '4.805e-05', 'epoch': '0.09613', 'num_input_tokens_seen': 7815446, 'train_runtime': '3953', 'train_tokens_per_second': '1977'} +{'loss': '0.3557', 'grad_norm': '1.61', 'learning_rate': '4.806e-05', 'epoch': '0.09616', 'num_input_tokens_seen': 7817493, 'train_runtime': '3954', 'train_tokens_per_second': '1977'} +{'loss': '0.6646', 'grad_norm': '1.705', 'learning_rate': '4.807e-05', 'epoch': '0.09618', 'num_input_tokens_seen': 7819540, 'train_runtime': '3955', 'train_tokens_per_second': '1977'} +{'loss': '1.849', 'grad_norm': '3.075', 'learning_rate': '4.809e-05', 'epoch': '0.09621', 'num_input_tokens_seen': 7821587, 'train_runtime': '3956', 'train_tokens_per_second': '1977'} +{'loss': '0.4727', 'grad_norm': '1.154', 'learning_rate': '4.81e-05', 'epoch': '0.09623', 'num_input_tokens_seen': 7823634, 'train_runtime': '3957', 'train_tokens_per_second': '1977'} +{'loss': '0.4504', 'grad_norm': '1.54', 'learning_rate': '4.811e-05', 'epoch': '0.09626', 'num_input_tokens_seen': 7825681, 'train_runtime': '3958', 'train_tokens_per_second': '1977'} +{'loss': '0.7439', 'grad_norm': '1.959', 'learning_rate': '4.812e-05', 'epoch': '0.09628', 'num_input_tokens_seen': 7827728, 'train_runtime': '3959', 'train_tokens_per_second': '1977'} +{'loss': '0.6246', 'grad_norm': '1.692', 'learning_rate': '4.814e-05', 'epoch': '0.09631', 'num_input_tokens_seen': 7829775, 'train_runtime': '3960', 'train_tokens_per_second': '1977'} +{'loss': '0.8324', 'grad_norm': '2.46', 'learning_rate': '4.815e-05', 'epoch': '0.09633', 'num_input_tokens_seen': 7831822, 'train_runtime': '3961', 'train_tokens_per_second': '1977'} +{'loss': '0.3849', 'grad_norm': '1.499', 'learning_rate': '4.816e-05', 'epoch': '0.09636', 'num_input_tokens_seen': 7833869, 'train_runtime': '3962', 'train_tokens_per_second': '1977'} +{'loss': '0.8351', 'grad_norm': '1.722', 'learning_rate': '4.817e-05', 'epoch': '0.09638', 'num_input_tokens_seen': 7835916, 'train_runtime': '3963', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '2.273', 'learning_rate': '4.819e-05', 'epoch': '0.09641', 'num_input_tokens_seen': 7837963, 'train_runtime': '3964', 'train_tokens_per_second': '1977'} +{'loss': '1.323', 'grad_norm': '2.372', 'learning_rate': '4.82e-05', 'epoch': '0.09643', 'num_input_tokens_seen': 7840010, 'train_runtime': '3965', 'train_tokens_per_second': '1977'} +{'loss': '0.4655', 'grad_norm': '1.591', 'learning_rate': '4.821e-05', 'epoch': '0.09646', 'num_input_tokens_seen': 7842057, 'train_runtime': '3966', 'train_tokens_per_second': '1977'} +{'loss': '1.441', 'grad_norm': '2.955', 'learning_rate': '4.823e-05', 'epoch': '0.09648', 'num_input_tokens_seen': 7844104, 'train_runtime': '3967', 'train_tokens_per_second': '1977'} +{'loss': '1.462', 'grad_norm': '3', 'learning_rate': '4.824e-05', 'epoch': '0.09651', 'num_input_tokens_seen': 7846151, 'train_runtime': '3968', 'train_tokens_per_second': '1977'} +{'loss': '0.4311', 'grad_norm': '1.474', 'learning_rate': '4.825e-05', 'epoch': '0.09653', 'num_input_tokens_seen': 7848198, 'train_runtime': '3969', 'train_tokens_per_second': '1977'} +{'loss': '0.6151', 'grad_norm': '1.827', 'learning_rate': '4.826e-05', 'epoch': '0.09656', 'num_input_tokens_seen': 7850245, 'train_runtime': '3970', 'train_tokens_per_second': '1977'} +{'loss': '1.227', 'grad_norm': '2.444', 'learning_rate': '4.828e-05', 'epoch': '0.09658', 'num_input_tokens_seen': 7852292, 'train_runtime': '3971', 'train_tokens_per_second': '1977'} +{'loss': '0.5202', 'grad_norm': '1.597', 'learning_rate': '4.829e-05', 'epoch': '0.09661', 'num_input_tokens_seen': 7854339, 'train_runtime': '3972', 'train_tokens_per_second': '1977'} +{'loss': '0.4474', 'grad_norm': '1.417', 'learning_rate': '4.83e-05', 'epoch': '0.09663', 'num_input_tokens_seen': 7856386, 'train_runtime': '3973', 'train_tokens_per_second': '1977'} +{'loss': '0.3074', 'grad_norm': '1.632', 'learning_rate': '4.831e-05', 'epoch': '0.09666', 'num_input_tokens_seen': 7858433, 'train_runtime': '3974', 'train_tokens_per_second': '1977'} +{'loss': '1.679', 'grad_norm': '2.713', 'learning_rate': '4.833e-05', 'epoch': '0.09668', 'num_input_tokens_seen': 7860480, 'train_runtime': '3975', 'train_tokens_per_second': '1977'} +{'loss': '1.558', 'grad_norm': '2.739', 'learning_rate': '4.834e-05', 'epoch': '0.09671', 'num_input_tokens_seen': 7862527, 'train_runtime': '3976', 'train_tokens_per_second': '1977'} +{'loss': '0.7085', 'grad_norm': '1.22', 'learning_rate': '4.835e-05', 'epoch': '0.09673', 'num_input_tokens_seen': 7864574, 'train_runtime': '3977', 'train_tokens_per_second': '1977'} +{'loss': '2.116', 'grad_norm': '3.417', 'learning_rate': '4.836e-05', 'epoch': '0.09676', 'num_input_tokens_seen': 7866621, 'train_runtime': '3978', 'train_tokens_per_second': '1977'} +{'loss': '0.7925', 'grad_norm': '2.177', 'learning_rate': '4.838e-05', 'epoch': '0.09678', 'num_input_tokens_seen': 7868668, 'train_runtime': '3979', 'train_tokens_per_second': '1977'} +{'loss': '0.6147', 'grad_norm': '1.461', 'learning_rate': '4.839e-05', 'epoch': '0.09681', 'num_input_tokens_seen': 7870715, 'train_runtime': '3981', 'train_tokens_per_second': '1977'} +{'loss': '1.978', 'grad_norm': '3.079', 'learning_rate': '4.84e-05', 'epoch': '0.09684', 'num_input_tokens_seen': 7872762, 'train_runtime': '3982', 'train_tokens_per_second': '1977'} +{'loss': '0.9237', 'grad_norm': '1.886', 'learning_rate': '4.841e-05', 'epoch': '0.09686', 'num_input_tokens_seen': 7874809, 'train_runtime': '3983', 'train_tokens_per_second': '1977'} +{'loss': '1.226', 'grad_norm': '2.196', 'learning_rate': '4.843e-05', 'epoch': '0.09689', 'num_input_tokens_seen': 7876856, 'train_runtime': '3984', 'train_tokens_per_second': '1977'} +{'loss': '2.257', 'grad_norm': '3.062', 'learning_rate': '4.844e-05', 'epoch': '0.09691', 'num_input_tokens_seen': 7878903, 'train_runtime': '3985', 'train_tokens_per_second': '1977'} +{'loss': '0.8085', 'grad_norm': '1.791', 'learning_rate': '4.845e-05', 'epoch': '0.09694', 'num_input_tokens_seen': 7880950, 'train_runtime': '3986', 'train_tokens_per_second': '1977'} +{'loss': '0.8461', 'grad_norm': '1.553', 'learning_rate': '4.846e-05', 'epoch': '0.09696', 'num_input_tokens_seen': 7882997, 'train_runtime': '3987', 'train_tokens_per_second': '1977'} +{'loss': '0.2986', 'grad_norm': '1.309', 'learning_rate': '4.848e-05', 'epoch': '0.09699', 'num_input_tokens_seen': 7885044, 'train_runtime': '3988', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '2.543', 'learning_rate': '4.849e-05', 'epoch': '0.09701', 'num_input_tokens_seen': 7887091, 'train_runtime': '3989', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '2.181', 'learning_rate': '4.85e-05', 'epoch': '0.09704', 'num_input_tokens_seen': 7889138, 'train_runtime': '3990', 'train_tokens_per_second': '1977'} +{'loss': '0.773', 'grad_norm': '1.929', 'learning_rate': '4.851e-05', 'epoch': '0.09706', 'num_input_tokens_seen': 7891185, 'train_runtime': '3991', 'train_tokens_per_second': '1977'} +{'loss': '0.9104', 'grad_norm': '1.924', 'learning_rate': '4.853e-05', 'epoch': '0.09709', 'num_input_tokens_seen': 7893232, 'train_runtime': '3992', 'train_tokens_per_second': '1977'} +{'loss': '0.4088', 'grad_norm': '1.311', 'learning_rate': '4.854e-05', 'epoch': '0.09711', 'num_input_tokens_seen': 7895279, 'train_runtime': '3993', 'train_tokens_per_second': '1977'} +{'loss': '0.5572', 'grad_norm': '1.957', 'learning_rate': '4.855e-05', 'epoch': '0.09714', 'num_input_tokens_seen': 7897326, 'train_runtime': '3994', 'train_tokens_per_second': '1977'} +{'loss': '0.4378', 'grad_norm': '1.583', 'learning_rate': '4.856e-05', 'epoch': '0.09716', 'num_input_tokens_seen': 7899373, 'train_runtime': '3995', 'train_tokens_per_second': '1977'} +{'loss': '1.423', 'grad_norm': '2.326', 'learning_rate': '4.858e-05', 'epoch': '0.09719', 'num_input_tokens_seen': 7901420, 'train_runtime': '3996', 'train_tokens_per_second': '1977'} +{'loss': '1.066', 'grad_norm': '2.633', 'learning_rate': '4.859e-05', 'epoch': '0.09721', 'num_input_tokens_seen': 7903467, 'train_runtime': '3997', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '2.345', 'learning_rate': '4.86e-05', 'epoch': '0.09724', 'num_input_tokens_seen': 7905514, 'train_runtime': '3998', 'train_tokens_per_second': '1977'} +{'loss': '0.7728', 'grad_norm': '1.878', 'learning_rate': '4.862e-05', 'epoch': '0.09726', 'num_input_tokens_seen': 7907561, 'train_runtime': '3999', 'train_tokens_per_second': '1977'} +{'loss': '0.4108', 'grad_norm': '1.554', 'learning_rate': '4.863e-05', 'epoch': '0.09729', 'num_input_tokens_seen': 7909608, 'train_runtime': '4000', 'train_tokens_per_second': '1977'} +{'loss': '0.4577', 'grad_norm': '1.644', 'learning_rate': '4.864e-05', 'epoch': '0.09731', 'num_input_tokens_seen': 7911655, 'train_runtime': '4001', 'train_tokens_per_second': '1977'} +{'loss': '0.8357', 'grad_norm': '1.824', 'learning_rate': '4.865e-05', 'epoch': '0.09734', 'num_input_tokens_seen': 7913702, 'train_runtime': '4002', 'train_tokens_per_second': '1977'} +{'loss': '2.552', 'grad_norm': '3.184', 'learning_rate': '4.867e-05', 'epoch': '0.09736', 'num_input_tokens_seen': 7915749, 'train_runtime': '4003', 'train_tokens_per_second': '1977'} +{'loss': '0.4007', 'grad_norm': '1.488', 'learning_rate': '4.868e-05', 'epoch': '0.09739', 'num_input_tokens_seen': 7917796, 'train_runtime': '4004', 'train_tokens_per_second': '1977'} +{'loss': '2.074', 'grad_norm': '2.44', 'learning_rate': '4.869e-05', 'epoch': '0.09741', 'num_input_tokens_seen': 7919843, 'train_runtime': '4005', 'train_tokens_per_second': '1977'} +{'loss': '0.3469', 'grad_norm': '1.303', 'learning_rate': '4.87e-05', 'epoch': '0.09744', 'num_input_tokens_seen': 7921890, 'train_runtime': '4006', 'train_tokens_per_second': '1977'} +{'loss': '0.8269', 'grad_norm': '2.244', 'learning_rate': '4.872e-05', 'epoch': '0.09746', 'num_input_tokens_seen': 7923937, 'train_runtime': '4007', 'train_tokens_per_second': '1977'} +{'loss': '0.3297', 'grad_norm': '1.292', 'learning_rate': '4.873e-05', 'epoch': '0.09749', 'num_input_tokens_seen': 7925984, 'train_runtime': '4008', 'train_tokens_per_second': '1977'} +{'loss': '0.9639', 'grad_norm': '1.843', 'learning_rate': '4.874e-05', 'epoch': '0.09751', 'num_input_tokens_seen': 7928031, 'train_runtime': '4010', 'train_tokens_per_second': '1977'} +{'loss': '1.238', 'grad_norm': '2.747', 'learning_rate': '4.875e-05', 'epoch': '0.09754', 'num_input_tokens_seen': 7930078, 'train_runtime': '4011', 'train_tokens_per_second': '1977'} +{'loss': '0.4156', 'grad_norm': '1.464', 'learning_rate': '4.877e-05', 'epoch': '0.09757', 'num_input_tokens_seen': 7932125, 'train_runtime': '4012', 'train_tokens_per_second': '1977'} +{'loss': '1.604', 'grad_norm': '2.882', 'learning_rate': '4.878e-05', 'epoch': '0.09759', 'num_input_tokens_seen': 7934172, 'train_runtime': '4013', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '2.073', 'learning_rate': '4.879e-05', 'epoch': '0.09762', 'num_input_tokens_seen': 7936219, 'train_runtime': '4014', 'train_tokens_per_second': '1977'} +{'loss': '0.9424', 'grad_norm': '2.337', 'learning_rate': '4.88e-05', 'epoch': '0.09764', 'num_input_tokens_seen': 7938266, 'train_runtime': '4015', 'train_tokens_per_second': '1977'} +{'loss': '1.248', 'grad_norm': '2.154', 'learning_rate': '4.882e-05', 'epoch': '0.09767', 'num_input_tokens_seen': 7940313, 'train_runtime': '4016', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '2.112', 'learning_rate': '4.883e-05', 'epoch': '0.09769', 'num_input_tokens_seen': 7942360, 'train_runtime': '4017', 'train_tokens_per_second': '1977'} +{'loss': '0.4023', 'grad_norm': '1.511', 'learning_rate': '4.884e-05', 'epoch': '0.09772', 'num_input_tokens_seen': 7944407, 'train_runtime': '4018', 'train_tokens_per_second': '1977'} +{'loss': '1.243', 'grad_norm': '2.45', 'learning_rate': '4.885e-05', 'epoch': '0.09774', 'num_input_tokens_seen': 7946454, 'train_runtime': '4019', 'train_tokens_per_second': '1977'} +{'loss': '2.532', 'grad_norm': '2.813', 'learning_rate': '4.887e-05', 'epoch': '0.09777', 'num_input_tokens_seen': 7948501, 'train_runtime': '4020', 'train_tokens_per_second': '1977'} +{'loss': '0.3812', 'grad_norm': '1.47', 'learning_rate': '4.888e-05', 'epoch': '0.09779', 'num_input_tokens_seen': 7950548, 'train_runtime': '4021', 'train_tokens_per_second': '1977'} +{'loss': '0.4873', 'grad_norm': '1.997', 'learning_rate': '4.889e-05', 'epoch': '0.09782', 'num_input_tokens_seen': 7952595, 'train_runtime': '4022', 'train_tokens_per_second': '1977'} +{'loss': '1.224', 'grad_norm': '2.26', 'learning_rate': '4.89e-05', 'epoch': '0.09784', 'num_input_tokens_seen': 7954642, 'train_runtime': '4023', 'train_tokens_per_second': '1977'} +{'loss': '0.4009', 'grad_norm': '1.337', 'learning_rate': '4.892e-05', 'epoch': '0.09787', 'num_input_tokens_seen': 7956689, 'train_runtime': '4024', 'train_tokens_per_second': '1977'} +{'loss': '1.894', 'grad_norm': '2.715', 'learning_rate': '4.893e-05', 'epoch': '0.09789', 'num_input_tokens_seen': 7958736, 'train_runtime': '4025', 'train_tokens_per_second': '1977'} +{'loss': '0.4646', 'grad_norm': '1.341', 'learning_rate': '4.894e-05', 'epoch': '0.09792', 'num_input_tokens_seen': 7960783, 'train_runtime': '4026', 'train_tokens_per_second': '1977'} +{'loss': '1.691', 'grad_norm': '2.701', 'learning_rate': '4.896e-05', 'epoch': '0.09794', 'num_input_tokens_seen': 7962830, 'train_runtime': '4027', 'train_tokens_per_second': '1977'} +{'loss': '0.3289', 'grad_norm': '1.081', 'learning_rate': '4.897e-05', 'epoch': '0.09797', 'num_input_tokens_seen': 7964877, 'train_runtime': '4028', 'train_tokens_per_second': '1977'} +{'loss': '0.804', 'grad_norm': '2.272', 'learning_rate': '4.898e-05', 'epoch': '0.09799', 'num_input_tokens_seen': 7966924, 'train_runtime': '4029', 'train_tokens_per_second': '1977'} +{'loss': '0.69', 'grad_norm': '1.706', 'learning_rate': '4.899e-05', 'epoch': '0.09802', 'num_input_tokens_seen': 7968971, 'train_runtime': '4030', 'train_tokens_per_second': '1977'} +{'loss': '0.333', 'grad_norm': '1.5', 'learning_rate': '4.901e-05', 'epoch': '0.09804', 'num_input_tokens_seen': 7971018, 'train_runtime': '4031', 'train_tokens_per_second': '1977'} +{'loss': '0.802', 'grad_norm': '1.922', 'learning_rate': '4.902e-05', 'epoch': '0.09807', 'num_input_tokens_seen': 7973065, 'train_runtime': '4032', 'train_tokens_per_second': '1977'} +{'loss': '0.9736', 'grad_norm': '1.93', 'learning_rate': '4.903e-05', 'epoch': '0.09809', 'num_input_tokens_seen': 7975112, 'train_runtime': '4033', 'train_tokens_per_second': '1977'} +{'loss': '0.8027', 'grad_norm': '1.975', 'learning_rate': '4.904e-05', 'epoch': '0.09812', 'num_input_tokens_seen': 7977159, 'train_runtime': '4034', 'train_tokens_per_second': '1977'} +{'loss': '0.8927', 'grad_norm': '2.428', 'learning_rate': '4.906e-05', 'epoch': '0.09814', 'num_input_tokens_seen': 7979206, 'train_runtime': '4035', 'train_tokens_per_second': '1977'} +{'loss': '1.14', 'grad_norm': '2.268', 'learning_rate': '4.907e-05', 'epoch': '0.09817', 'num_input_tokens_seen': 7981253, 'train_runtime': '4036', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '2.235', 'learning_rate': '4.908e-05', 'epoch': '0.09819', 'num_input_tokens_seen': 7983300, 'train_runtime': '4037', 'train_tokens_per_second': '1977'} +{'loss': '0.7857', 'grad_norm': '2.087', 'learning_rate': '4.909e-05', 'epoch': '0.09822', 'num_input_tokens_seen': 7985347, 'train_runtime': '4038', 'train_tokens_per_second': '1977'} +{'loss': '1.192', 'grad_norm': '2.579', 'learning_rate': '4.911e-05', 'epoch': '0.09825', 'num_input_tokens_seen': 7987394, 'train_runtime': '4040', 'train_tokens_per_second': '1977'} +{'loss': '0.3728', 'grad_norm': '1.403', 'learning_rate': '4.912e-05', 'epoch': '0.09827', 'num_input_tokens_seen': 7989441, 'train_runtime': '4041', 'train_tokens_per_second': '1977'} +{'loss': '0.5988', 'grad_norm': '1.547', 'learning_rate': '4.913e-05', 'epoch': '0.0983', 'num_input_tokens_seen': 7991488, 'train_runtime': '4042', 'train_tokens_per_second': '1977'} +{'loss': '0.455', 'grad_norm': '1.481', 'learning_rate': '4.914e-05', 'epoch': '0.09832', 'num_input_tokens_seen': 7993535, 'train_runtime': '4043', 'train_tokens_per_second': '1977'} +{'loss': '0.7796', 'grad_norm': '2.416', 'learning_rate': '4.916e-05', 'epoch': '0.09835', 'num_input_tokens_seen': 7995582, 'train_runtime': '4044', 'train_tokens_per_second': '1977'} +{'loss': '1.427', 'grad_norm': '2.606', 'learning_rate': '4.917e-05', 'epoch': '0.09837', 'num_input_tokens_seen': 7997629, 'train_runtime': '4045', 'train_tokens_per_second': '1977'} +{'loss': '0.6069', 'grad_norm': '1.755', 'learning_rate': '4.918e-05', 'epoch': '0.0984', 'num_input_tokens_seen': 7999676, 'train_runtime': '4046', 'train_tokens_per_second': '1977'} +{'loss': '0.4267', 'grad_norm': '1.716', 'learning_rate': '4.919e-05', 'epoch': '0.09842', 'num_input_tokens_seen': 8001723, 'train_runtime': '4047', 'train_tokens_per_second': '1977'} +{'loss': '0.9558', 'grad_norm': '2.229', 'learning_rate': '4.921e-05', 'epoch': '0.09845', 'num_input_tokens_seen': 8003770, 'train_runtime': '4048', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.403', 'learning_rate': '4.922e-05', 'epoch': '0.09847', 'num_input_tokens_seen': 8005817, 'train_runtime': '4049', 'train_tokens_per_second': '1977'} +{'loss': '0.8763', 'grad_norm': '1.543', 'learning_rate': '4.923e-05', 'epoch': '0.0985', 'num_input_tokens_seen': 8007864, 'train_runtime': '4050', 'train_tokens_per_second': '1977'} +{'loss': '0.6922', 'grad_norm': '1.605', 'learning_rate': '4.924e-05', 'epoch': '0.09852', 'num_input_tokens_seen': 8009911, 'train_runtime': '4051', 'train_tokens_per_second': '1977'} +{'loss': '0.7437', 'grad_norm': '1.662', 'learning_rate': '4.926e-05', 'epoch': '0.09855', 'num_input_tokens_seen': 8011958, 'train_runtime': '4052', 'train_tokens_per_second': '1977'} +{'loss': '1.148', 'grad_norm': '2.374', 'learning_rate': '4.927e-05', 'epoch': '0.09857', 'num_input_tokens_seen': 8014005, 'train_runtime': '4053', 'train_tokens_per_second': '1977'} +{'loss': '0.4541', 'grad_norm': '1.398', 'learning_rate': '4.928e-05', 'epoch': '0.0986', 'num_input_tokens_seen': 8016052, 'train_runtime': '4054', 'train_tokens_per_second': '1977'} +{'loss': '2.029', 'grad_norm': '3.433', 'learning_rate': '4.93e-05', 'epoch': '0.09862', 'num_input_tokens_seen': 8018099, 'train_runtime': '4055', 'train_tokens_per_second': '1977'} +{'loss': '1.85', 'grad_norm': '2.892', 'learning_rate': '4.931e-05', 'epoch': '0.09865', 'num_input_tokens_seen': 8020146, 'train_runtime': '4056', 'train_tokens_per_second': '1977'} +{'loss': '0.3477', 'grad_norm': '1.19', 'learning_rate': '4.932e-05', 'epoch': '0.09867', 'num_input_tokens_seen': 8022193, 'train_runtime': '4057', 'train_tokens_per_second': '1977'} +{'loss': '1.355', 'grad_norm': '1.968', 'learning_rate': '4.933e-05', 'epoch': '0.0987', 'num_input_tokens_seen': 8024240, 'train_runtime': '4058', 'train_tokens_per_second': '1977'} +{'loss': '0.562', 'grad_norm': '1.602', 'learning_rate': '4.935e-05', 'epoch': '0.09872', 'num_input_tokens_seen': 8026287, 'train_runtime': '4059', 'train_tokens_per_second': '1977'} +{'loss': '1.306', 'grad_norm': '2.05', 'learning_rate': '4.936e-05', 'epoch': '0.09875', 'num_input_tokens_seen': 8028334, 'train_runtime': '4060', 'train_tokens_per_second': '1977'} +{'loss': '1.114', 'grad_norm': '2.284', 'learning_rate': '4.937e-05', 'epoch': '0.09877', 'num_input_tokens_seen': 8030381, 'train_runtime': '4061', 'train_tokens_per_second': '1977'} +{'loss': '0.4343', 'grad_norm': '1.173', 'learning_rate': '4.938e-05', 'epoch': '0.0988', 'num_input_tokens_seen': 8032428, 'train_runtime': '4062', 'train_tokens_per_second': '1977'} +{'loss': '1.602', 'grad_norm': '2.565', 'learning_rate': '4.94e-05', 'epoch': '0.09882', 'num_input_tokens_seen': 8034475, 'train_runtime': '4063', 'train_tokens_per_second': '1977'} +{'loss': '1.543', 'grad_norm': '2.879', 'learning_rate': '4.941e-05', 'epoch': '0.09885', 'num_input_tokens_seen': 8036522, 'train_runtime': '4064', 'train_tokens_per_second': '1977'} +{'loss': '0.9961', 'grad_norm': '2.056', 'learning_rate': '4.942e-05', 'epoch': '0.09887', 'num_input_tokens_seen': 8038569, 'train_runtime': '4065', 'train_tokens_per_second': '1977'} +{'loss': '0.9924', 'grad_norm': '1.741', 'learning_rate': '4.943e-05', 'epoch': '0.0989', 'num_input_tokens_seen': 8040616, 'train_runtime': '4066', 'train_tokens_per_second': '1977'} +{'loss': '0.4103', 'grad_norm': '1.274', 'learning_rate': '4.945e-05', 'epoch': '0.09892', 'num_input_tokens_seen': 8042663, 'train_runtime': '4067', 'train_tokens_per_second': '1977'} +{'loss': '0.8225', 'grad_norm': '2.064', 'learning_rate': '4.946e-05', 'epoch': '0.09895', 'num_input_tokens_seen': 8044710, 'train_runtime': '4068', 'train_tokens_per_second': '1977'} +{'loss': '0.7793', 'grad_norm': '2.008', 'learning_rate': '4.947e-05', 'epoch': '0.09898', 'num_input_tokens_seen': 8046757, 'train_runtime': '4070', 'train_tokens_per_second': '1977'} +{'loss': '0.7815', 'grad_norm': '1.818', 'learning_rate': '4.948e-05', 'epoch': '0.099', 'num_input_tokens_seen': 8048804, 'train_runtime': '4071', 'train_tokens_per_second': '1977'} +{'loss': '0.5945', 'grad_norm': '1.737', 'learning_rate': '4.95e-05', 'epoch': '0.09903', 'num_input_tokens_seen': 8050851, 'train_runtime': '4072', 'train_tokens_per_second': '1977'} +{'loss': '0.8164', 'grad_norm': '2.037', 'learning_rate': '4.951e-05', 'epoch': '0.09905', 'num_input_tokens_seen': 8052898, 'train_runtime': '4073', 'train_tokens_per_second': '1977'} +{'loss': '0.3415', 'grad_norm': '1.337', 'learning_rate': '4.952e-05', 'epoch': '0.09908', 'num_input_tokens_seen': 8054945, 'train_runtime': '4074', 'train_tokens_per_second': '1977'} +{'loss': '0.5438', 'grad_norm': '1.764', 'learning_rate': '4.953e-05', 'epoch': '0.0991', 'num_input_tokens_seen': 8056992, 'train_runtime': '4075', 'train_tokens_per_second': '1977'} +{'loss': '0.6221', 'grad_norm': '1.671', 'learning_rate': '4.955e-05', 'epoch': '0.09913', 'num_input_tokens_seen': 8059039, 'train_runtime': '4076', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '2.297', 'learning_rate': '4.956e-05', 'epoch': '0.09915', 'num_input_tokens_seen': 8061086, 'train_runtime': '4077', 'train_tokens_per_second': '1977'} +{'loss': '0.555', 'grad_norm': '1.743', 'learning_rate': '4.957e-05', 'epoch': '0.09918', 'num_input_tokens_seen': 8063133, 'train_runtime': '4078', 'train_tokens_per_second': '1977'} +{'loss': '0.4255', 'grad_norm': '1.253', 'learning_rate': '4.958e-05', 'epoch': '0.0992', 'num_input_tokens_seen': 8065180, 'train_runtime': '4079', 'train_tokens_per_second': '1977'} +{'loss': '0.8154', 'grad_norm': '1.717', 'learning_rate': '4.96e-05', 'epoch': '0.09923', 'num_input_tokens_seen': 8067227, 'train_runtime': '4080', 'train_tokens_per_second': '1977'} +{'loss': '0.6637', 'grad_norm': '1.743', 'learning_rate': '4.961e-05', 'epoch': '0.09925', 'num_input_tokens_seen': 8069274, 'train_runtime': '4081', 'train_tokens_per_second': '1977'} +{'loss': '0.9121', 'grad_norm': '2.025', 'learning_rate': '4.962e-05', 'epoch': '0.09928', 'num_input_tokens_seen': 8071321, 'train_runtime': '4082', 'train_tokens_per_second': '1977'} +{'loss': '0.5232', 'grad_norm': '1.362', 'learning_rate': '4.963e-05', 'epoch': '0.0993', 'num_input_tokens_seen': 8073368, 'train_runtime': '4083', 'train_tokens_per_second': '1977'} +{'loss': '1.847', 'grad_norm': '2.889', 'learning_rate': '4.965e-05', 'epoch': '0.09933', 'num_input_tokens_seen': 8075415, 'train_runtime': '4084', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.869', 'learning_rate': '4.966e-05', 'epoch': '0.09935', 'num_input_tokens_seen': 8077462, 'train_runtime': '4085', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '2.418', 'learning_rate': '4.967e-05', 'epoch': '0.09938', 'num_input_tokens_seen': 8079509, 'train_runtime': '4086', 'train_tokens_per_second': '1977'} +{'loss': '1.747', 'grad_norm': '3.212', 'learning_rate': '4.969e-05', 'epoch': '0.0994', 'num_input_tokens_seen': 8081556, 'train_runtime': '4087', 'train_tokens_per_second': '1977'} +{'loss': '1.88', 'grad_norm': '2.711', 'learning_rate': '4.97e-05', 'epoch': '0.09943', 'num_input_tokens_seen': 8083603, 'train_runtime': '4088', 'train_tokens_per_second': '1977'} +{'loss': '0.7632', 'grad_norm': '1.626', 'learning_rate': '4.971e-05', 'epoch': '0.09945', 'num_input_tokens_seen': 8085650, 'train_runtime': '4089', 'train_tokens_per_second': '1977'} +{'loss': '0.7107', 'grad_norm': '1.932', 'learning_rate': '4.972e-05', 'epoch': '0.09948', 'num_input_tokens_seen': 8087697, 'train_runtime': '4090', 'train_tokens_per_second': '1977'} +{'loss': '1.151', 'grad_norm': '2.146', 'learning_rate': '4.974e-05', 'epoch': '0.0995', 'num_input_tokens_seen': 8089744, 'train_runtime': '4091', 'train_tokens_per_second': '1977'} +{'loss': '0.8925', 'grad_norm': '1.822', 'learning_rate': '4.975e-05', 'epoch': '0.09953', 'num_input_tokens_seen': 8091791, 'train_runtime': '4092', 'train_tokens_per_second': '1977'} +{'loss': '1.791', 'grad_norm': '2.867', 'learning_rate': '4.976e-05', 'epoch': '0.09955', 'num_input_tokens_seen': 8093838, 'train_runtime': '4093', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.516', 'learning_rate': '4.977e-05', 'epoch': '0.09958', 'num_input_tokens_seen': 8095885, 'train_runtime': '4094', 'train_tokens_per_second': '1977'} +{'loss': '1.533', 'grad_norm': '2.491', 'learning_rate': '4.979e-05', 'epoch': '0.0996', 'num_input_tokens_seen': 8097932, 'train_runtime': '4095', 'train_tokens_per_second': '1977'} +{'loss': '0.4495', 'grad_norm': '1.367', 'learning_rate': '4.98e-05', 'epoch': '0.09963', 'num_input_tokens_seen': 8099979, 'train_runtime': '4096', 'train_tokens_per_second': '1977'} +{'loss': '1.43', 'grad_norm': '2.077', 'learning_rate': '4.981e-05', 'epoch': '0.09966', 'num_input_tokens_seen': 8102026, 'train_runtime': '4097', 'train_tokens_per_second': '1977'} +{'loss': '2.537', 'grad_norm': '3.39', 'learning_rate': '4.982e-05', 'epoch': '0.09968', 'num_input_tokens_seen': 8104073, 'train_runtime': '4099', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.386', 'learning_rate': '4.984e-05', 'epoch': '0.09971', 'num_input_tokens_seen': 8106120, 'train_runtime': '4100', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.957', 'learning_rate': '4.985e-05', 'epoch': '0.09973', 'num_input_tokens_seen': 8108167, 'train_runtime': '4101', 'train_tokens_per_second': '1977'} +{'loss': '1.472', 'grad_norm': '2.084', 'learning_rate': '4.986e-05', 'epoch': '0.09976', 'num_input_tokens_seen': 8110214, 'train_runtime': '4102', 'train_tokens_per_second': '1977'} +{'loss': '1.621', 'grad_norm': '2.976', 'learning_rate': '4.987e-05', 'epoch': '0.09978', 'num_input_tokens_seen': 8112261, 'train_runtime': '4103', 'train_tokens_per_second': '1977'} +{'loss': '0.2675', 'grad_norm': '1.346', 'learning_rate': '4.989e-05', 'epoch': '0.09981', 'num_input_tokens_seen': 8114308, 'train_runtime': '4104', 'train_tokens_per_second': '1977'} +{'loss': '0.8947', 'grad_norm': '2.067', 'learning_rate': '4.99e-05', 'epoch': '0.09983', 'num_input_tokens_seen': 8116355, 'train_runtime': '4105', 'train_tokens_per_second': '1977'} +{'loss': '0.9834', 'grad_norm': '2.057', 'learning_rate': '4.991e-05', 'epoch': '0.09986', 'num_input_tokens_seen': 8118402, 'train_runtime': '4106', 'train_tokens_per_second': '1977'} +{'loss': '0.9861', 'grad_norm': '1.871', 'learning_rate': '4.992e-05', 'epoch': '0.09988', 'num_input_tokens_seen': 8120449, 'train_runtime': '4107', 'train_tokens_per_second': '1977'} +{'loss': '0.2783', 'grad_norm': '1.106', 'learning_rate': '4.994e-05', 'epoch': '0.09991', 'num_input_tokens_seen': 8122496, 'train_runtime': '4108', 'train_tokens_per_second': '1977'} +{'loss': '0.6191', 'grad_norm': '1.505', 'learning_rate': '4.995e-05', 'epoch': '0.09993', 'num_input_tokens_seen': 8124543, 'train_runtime': '4109', 'train_tokens_per_second': '1977'} +{'loss': '0.6231', 'grad_norm': '1.447', 'learning_rate': '4.996e-05', 'epoch': '0.09996', 'num_input_tokens_seen': 8126590, 'train_runtime': '4110', 'train_tokens_per_second': '1977'} +{'loss': '0.4607', 'grad_norm': '1.816', 'learning_rate': '4.997e-05', 'epoch': '0.09998', 'num_input_tokens_seen': 8128637, 'train_runtime': '4111', 'train_tokens_per_second': '1977'} +{'loss': '0.7402', 'grad_norm': '1.912', 'learning_rate': '4.999e-05', 'epoch': '0.1', 'num_input_tokens_seen': 8130684, 'train_runtime': '4112', 'train_tokens_per_second': '1977'} +{'loss': '0.9338', 'grad_norm': '2.655', 'learning_rate': '5e-05', 'epoch': '0.1', 'num_input_tokens_seen': 8132731, 'train_runtime': '4113', 'train_tokens_per_second': '1977'} +{'loss': '2.215', 'grad_norm': '2.989', 'learning_rate': '5e-05', 'epoch': '0.1001', 'num_input_tokens_seen': 8134778, 'train_runtime': '4114', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '2.361', 'learning_rate': '5e-05', 'epoch': '0.1001', 'num_input_tokens_seen': 8136825, 'train_runtime': '4115', 'train_tokens_per_second': '1977'} +{'loss': '1.185', 'grad_norm': '3.145', 'learning_rate': '5e-05', 'epoch': '0.1001', 'num_input_tokens_seen': 8138872, 'train_runtime': '4116', 'train_tokens_per_second': '1977'} +{'loss': '0.3219', 'grad_norm': '1.188', 'learning_rate': '5e-05', 'epoch': '0.1001', 'num_input_tokens_seen': 8140919, 'train_runtime': '4117', 'train_tokens_per_second': '1977'} +{'loss': '0.6062', 'grad_norm': '1.499', 'learning_rate': '5e-05', 'epoch': '0.1002', 'num_input_tokens_seen': 8142966, 'train_runtime': '4118', 'train_tokens_per_second': '1977'} +{'loss': '0.5633', 'grad_norm': '1.578', 'learning_rate': '5e-05', 'epoch': '0.1002', 'num_input_tokens_seen': 8145013, 'train_runtime': '4119', 'train_tokens_per_second': '1977'} +{'loss': '0.7435', 'grad_norm': '1.783', 'learning_rate': '5e-05', 'epoch': '0.1002', 'num_input_tokens_seen': 8147060, 'train_runtime': '4120', 'train_tokens_per_second': '1977'} +{'loss': '0.6912', 'grad_norm': '1.583', 'learning_rate': '5e-05', 'epoch': '0.1002', 'num_input_tokens_seen': 8149107, 'train_runtime': '4121', 'train_tokens_per_second': '1977'} +{'loss': '2.678', 'grad_norm': '2.892', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 8151154, 'train_runtime': '4122', 'train_tokens_per_second': '1977'} +{'loss': '0.6485', 'grad_norm': '1.404', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 8153201, 'train_runtime': '4123', 'train_tokens_per_second': '1977'} +{'loss': '0.7551', 'grad_norm': '1.443', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 8155248, 'train_runtime': '4124', 'train_tokens_per_second': '1977'} +{'loss': '0.4821', 'grad_norm': '1.506', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 8157295, 'train_runtime': '4125', 'train_tokens_per_second': '1977'} +{'loss': '0.9071', 'grad_norm': '1.515', 'learning_rate': '5e-05', 'epoch': '0.1004', 'num_input_tokens_seen': 8159342, 'train_runtime': '4126', 'train_tokens_per_second': '1977'} +{'loss': '0.3944', 'grad_norm': '1.129', 'learning_rate': '5e-05', 'epoch': '0.1004', 'num_input_tokens_seen': 8161389, 'train_runtime': '4127', 'train_tokens_per_second': '1977'} +{'loss': '0.5245', 'grad_norm': '1.472', 'learning_rate': '5e-05', 'epoch': '0.1004', 'num_input_tokens_seen': 8163436, 'train_runtime': '4129', 'train_tokens_per_second': '1977'} +{'loss': '0.6448', 'grad_norm': '1.499', 'learning_rate': '5e-05', 'epoch': '0.1004', 'num_input_tokens_seen': 8165483, 'train_runtime': '4130', 'train_tokens_per_second': '1977'} +{'loss': '0.6491', 'grad_norm': '1.599', 'learning_rate': '5e-05', 'epoch': '0.1005', 'num_input_tokens_seen': 8167530, 'train_runtime': '4131', 'train_tokens_per_second': '1977'} +{'loss': '1.418', 'grad_norm': '2.198', 'learning_rate': '5e-05', 'epoch': '0.1005', 'num_input_tokens_seen': 8169577, 'train_runtime': '4132', 'train_tokens_per_second': '1977'} +{'loss': '1.491', 'grad_norm': '2.121', 'learning_rate': '5e-05', 'epoch': '0.1005', 'num_input_tokens_seen': 8171624, 'train_runtime': '4133', 'train_tokens_per_second': '1977'} +{'loss': '0.8098', 'grad_norm': '1.918', 'learning_rate': '5e-05', 'epoch': '0.1005', 'num_input_tokens_seen': 8173671, 'train_runtime': '4134', 'train_tokens_per_second': '1977'} +{'loss': '2.503', 'grad_norm': '3.448', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 8175718, 'train_runtime': '4135', 'train_tokens_per_second': '1977'} +{'loss': '0.776', 'grad_norm': '1.893', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 8177765, 'train_runtime': '4136', 'train_tokens_per_second': '1977'} +{'loss': '0.9358', 'grad_norm': '1.867', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 8179812, 'train_runtime': '4137', 'train_tokens_per_second': '1977'} +{'loss': '0.5411', 'grad_norm': '1.364', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 8181859, 'train_runtime': '4138', 'train_tokens_per_second': '1977'} +{'loss': '2.254', 'grad_norm': '3.565', 'learning_rate': '5e-05', 'epoch': '0.1007', 'num_input_tokens_seen': 8183906, 'train_runtime': '4139', 'train_tokens_per_second': '1977'} +{'loss': '0.7435', 'grad_norm': '1.873', 'learning_rate': '5e-05', 'epoch': '0.1007', 'num_input_tokens_seen': 8185953, 'train_runtime': '4140', 'train_tokens_per_second': '1977'} +{'loss': '0.7215', 'grad_norm': '1.421', 'learning_rate': '5e-05', 'epoch': '0.1007', 'num_input_tokens_seen': 8188000, 'train_runtime': '4141', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 03:46:26,444 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 03:46:26,444 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 03:46:26,841 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-4000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 03:46:26,854 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-4000/tokenizer_config.json + +{'loss': '0.3804', 'grad_norm': '1.392', 'learning_rate': '5e-05', 'epoch': '0.1007', 'num_input_tokens_seen': 8190047, 'train_runtime': '4143', 'train_tokens_per_second': '1977'} +{'loss': '1.497', 'grad_norm': '2.579', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 8192094, 'train_runtime': '4144', 'train_tokens_per_second': '1977'} +{'loss': '0.8908', 'grad_norm': '2.078', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 8194141, 'train_runtime': '4145', 'train_tokens_per_second': '1977'} +{'loss': '0.5332', 'grad_norm': '1.841', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 8196188, 'train_runtime': '4146', 'train_tokens_per_second': '1977'} +{'loss': '0.7366', 'grad_norm': '1.877', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 8198235, 'train_runtime': '4147', 'train_tokens_per_second': '1977'} +{'loss': '0.4822', 'grad_norm': '1.49', 'learning_rate': '5e-05', 'epoch': '0.1009', 'num_input_tokens_seen': 8200282, 'train_runtime': '4148', 'train_tokens_per_second': '1977'} +{'loss': '0.3747', 'grad_norm': '1.559', 'learning_rate': '5e-05', 'epoch': '0.1009', 'num_input_tokens_seen': 8202329, 'train_runtime': '4149', 'train_tokens_per_second': '1977'} +{'loss': '1', 'grad_norm': '1.802', 'learning_rate': '5e-05', 'epoch': '0.1009', 'num_input_tokens_seen': 8204376, 'train_runtime': '4150', 'train_tokens_per_second': '1977'} +{'loss': '0.4533', 'grad_norm': '1.712', 'learning_rate': '5e-05', 'epoch': '0.1009', 'num_input_tokens_seen': 8206423, 'train_runtime': '4151', 'train_tokens_per_second': '1977'} +{'loss': '1.094', 'grad_norm': '1.975', 'learning_rate': '5e-05', 'epoch': '0.101', 'num_input_tokens_seen': 8208470, 'train_runtime': '4152', 'train_tokens_per_second': '1977'} +{'loss': '1.291', 'grad_norm': '2.653', 'learning_rate': '5e-05', 'epoch': '0.101', 'num_input_tokens_seen': 8210517, 'train_runtime': '4153', 'train_tokens_per_second': '1977'} +{'loss': '0.5014', 'grad_norm': '1.341', 'learning_rate': '5e-05', 'epoch': '0.101', 'num_input_tokens_seen': 8212564, 'train_runtime': '4154', 'train_tokens_per_second': '1977'} +{'loss': '0.6187', 'grad_norm': '1.423', 'learning_rate': '5e-05', 'epoch': '0.101', 'num_input_tokens_seen': 8214611, 'train_runtime': '4155', 'train_tokens_per_second': '1977'} +{'loss': '2.675', 'grad_norm': '2.325', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 8216658, 'train_runtime': '4156', 'train_tokens_per_second': '1977'} +{'loss': '1.248', 'grad_norm': '2.296', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 8218705, 'train_runtime': '4157', 'train_tokens_per_second': '1977'} +{'loss': '0.6933', 'grad_norm': '1.593', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 8220752, 'train_runtime': '4158', 'train_tokens_per_second': '1977'} +{'loss': '1.08', 'grad_norm': '2.02', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 8222799, 'train_runtime': '4159', 'train_tokens_per_second': '1977'} +{'loss': '0.4334', 'grad_norm': '1.313', 'learning_rate': '5e-05', 'epoch': '0.1012', 'num_input_tokens_seen': 8224846, 'train_runtime': '4160', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '2.082', 'learning_rate': '5e-05', 'epoch': '0.1012', 'num_input_tokens_seen': 8226893, 'train_runtime': '4161', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '2.019', 'learning_rate': '5e-05', 'epoch': '0.1012', 'num_input_tokens_seen': 8228940, 'train_runtime': '4162', 'train_tokens_per_second': '1977'} +{'loss': '0.6646', 'grad_norm': '1.315', 'learning_rate': '5e-05', 'epoch': '0.1012', 'num_input_tokens_seen': 8230987, 'train_runtime': '4163', 'train_tokens_per_second': '1977'} +{'loss': '0.3779', 'grad_norm': '1.215', 'learning_rate': '5e-05', 'epoch': '0.1013', 'num_input_tokens_seen': 8233034, 'train_runtime': '4164', 'train_tokens_per_second': '1977'} +{'loss': '1.163', 'grad_norm': '2.097', 'learning_rate': '5e-05', 'epoch': '0.1013', 'num_input_tokens_seen': 8235081, 'train_runtime': '4165', 'train_tokens_per_second': '1977'} +{'loss': '1.489', 'grad_norm': '2.92', 'learning_rate': '5e-05', 'epoch': '0.1013', 'num_input_tokens_seen': 8237128, 'train_runtime': '4166', 'train_tokens_per_second': '1977'} +{'loss': '0.9391', 'grad_norm': '1.611', 'learning_rate': '5e-05', 'epoch': '0.1013', 'num_input_tokens_seen': 8239175, 'train_runtime': '4167', 'train_tokens_per_second': '1977'} +{'loss': '0.9204', 'grad_norm': '2.122', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 8241222, 'train_runtime': '4168', 'train_tokens_per_second': '1977'} +{'loss': '0.8449', 'grad_norm': '2.067', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 8243269, 'train_runtime': '4170', 'train_tokens_per_second': '1977'} +{'loss': '0.6875', 'grad_norm': '1.314', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 8245316, 'train_runtime': '4171', 'train_tokens_per_second': '1977'} +{'loss': '0.4305', 'grad_norm': '1.278', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 8247363, 'train_runtime': '4172', 'train_tokens_per_second': '1977'} +{'loss': '0.3951', 'grad_norm': '2.119', 'learning_rate': '5e-05', 'epoch': '0.1015', 'num_input_tokens_seen': 8249410, 'train_runtime': '4173', 'train_tokens_per_second': '1977'} +{'loss': '1.385', 'grad_norm': '2.529', 'learning_rate': '5e-05', 'epoch': '0.1015', 'num_input_tokens_seen': 8251457, 'train_runtime': '4174', 'train_tokens_per_second': '1977'} +{'loss': '1.027', 'grad_norm': '2.524', 'learning_rate': '5e-05', 'epoch': '0.1015', 'num_input_tokens_seen': 8253504, 'train_runtime': '4175', 'train_tokens_per_second': '1977'} +{'loss': '1.369', 'grad_norm': '2.811', 'learning_rate': '5e-05', 'epoch': '0.1015', 'num_input_tokens_seen': 8255551, 'train_runtime': '4176', 'train_tokens_per_second': '1977'} +{'loss': '0.4891', 'grad_norm': '1.697', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 8257598, 'train_runtime': '4177', 'train_tokens_per_second': '1977'} +{'loss': '1.159', 'grad_norm': '2.25', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 8259645, 'train_runtime': '4178', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.964', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 8261692, 'train_runtime': '4179', 'train_tokens_per_second': '1977'} +{'loss': '1.012', 'grad_norm': '2.37', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 8263739, 'train_runtime': '4180', 'train_tokens_per_second': '1977'} +{'loss': '0.4106', 'grad_norm': '1.329', 'learning_rate': '5e-05', 'epoch': '0.1017', 'num_input_tokens_seen': 8265786, 'train_runtime': '4181', 'train_tokens_per_second': '1977'} +{'loss': '0.9481', 'grad_norm': '2.511', 'learning_rate': '5e-05', 'epoch': '0.1017', 'num_input_tokens_seen': 8267833, 'train_runtime': '4182', 'train_tokens_per_second': '1977'} +{'loss': '0.6367', 'grad_norm': '1.321', 'learning_rate': '5e-05', 'epoch': '0.1017', 'num_input_tokens_seen': 8269880, 'train_runtime': '4183', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '1.98', 'learning_rate': '5e-05', 'epoch': '0.1017', 'num_input_tokens_seen': 8271927, 'train_runtime': '4184', 'train_tokens_per_second': '1977'} +{'loss': '0.5965', 'grad_norm': '1.569', 'learning_rate': '5e-05', 'epoch': '0.1018', 'num_input_tokens_seen': 8273974, 'train_runtime': '4185', 'train_tokens_per_second': '1977'} +{'loss': '1.759', 'grad_norm': '2.67', 'learning_rate': '5e-05', 'epoch': '0.1018', 'num_input_tokens_seen': 8276021, 'train_runtime': '4186', 'train_tokens_per_second': '1977'} +{'loss': '0.5295', 'grad_norm': '1.294', 'learning_rate': '5e-05', 'epoch': '0.1018', 'num_input_tokens_seen': 8278068, 'train_runtime': '4187', 'train_tokens_per_second': '1977'} +{'loss': '1.311', 'grad_norm': '2.56', 'learning_rate': '5e-05', 'epoch': '0.1018', 'num_input_tokens_seen': 8280115, 'train_runtime': '4188', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '2.032', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 8282162, 'train_runtime': '4189', 'train_tokens_per_second': '1977'} +{'loss': '0.8606', 'grad_norm': '1.916', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 8284209, 'train_runtime': '4190', 'train_tokens_per_second': '1977'} +{'loss': '0.3916', 'grad_norm': '1.464', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 8286256, 'train_runtime': '4191', 'train_tokens_per_second': '1977'} +{'loss': '0.7208', 'grad_norm': '1.375', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 8288303, 'train_runtime': '4192', 'train_tokens_per_second': '1977'} +{'loss': '0.696', 'grad_norm': '1.71', 'learning_rate': '5e-05', 'epoch': '0.102', 'num_input_tokens_seen': 8290350, 'train_runtime': '4193', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '2.155', 'learning_rate': '5e-05', 'epoch': '0.102', 'num_input_tokens_seen': 8292397, 'train_runtime': '4194', 'train_tokens_per_second': '1977'} +{'loss': '1.583', 'grad_norm': '2.406', 'learning_rate': '5e-05', 'epoch': '0.102', 'num_input_tokens_seen': 8294444, 'train_runtime': '4195', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.482', 'learning_rate': '5e-05', 'epoch': '0.102', 'num_input_tokens_seen': 8296491, 'train_runtime': '4196', 'train_tokens_per_second': '1977'} +{'loss': '3.252', 'grad_norm': '2.354', 'learning_rate': '5e-05', 'epoch': '0.1021', 'num_input_tokens_seen': 8298538, 'train_runtime': '4197', 'train_tokens_per_second': '1977'} +{'loss': '0.7518', 'grad_norm': '1.715', 'learning_rate': '5e-05', 'epoch': '0.1021', 'num_input_tokens_seen': 8300585, 'train_runtime': '4198', 'train_tokens_per_second': '1977'} +{'loss': '0.4889', 'grad_norm': '1.902', 'learning_rate': '5e-05', 'epoch': '0.1021', 'num_input_tokens_seen': 8302632, 'train_runtime': '4200', 'train_tokens_per_second': '1977'} +{'loss': '1.728', 'grad_norm': '2.361', 'learning_rate': '5e-05', 'epoch': '0.1021', 'num_input_tokens_seen': 8304679, 'train_runtime': '4201', 'train_tokens_per_second': '1977'} +{'loss': '2.096', 'grad_norm': '2.925', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 8306726, 'train_runtime': '4202', 'train_tokens_per_second': '1977'} +{'loss': '0.4381', 'grad_norm': '1.306', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 8308773, 'train_runtime': '4203', 'train_tokens_per_second': '1977'} +{'loss': '1.393', 'grad_norm': '2.045', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 8310820, 'train_runtime': '4204', 'train_tokens_per_second': '1977'} +{'loss': '1.449', 'grad_norm': '2.731', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 8312867, 'train_runtime': '4205', 'train_tokens_per_second': '1977'} +{'loss': '0.6061', 'grad_norm': '2.194', 'learning_rate': '5e-05', 'epoch': '0.1023', 'num_input_tokens_seen': 8314914, 'train_runtime': '4206', 'train_tokens_per_second': '1977'} +{'loss': '0.4499', 'grad_norm': '1.642', 'learning_rate': '5e-05', 'epoch': '0.1023', 'num_input_tokens_seen': 8316961, 'train_runtime': '4207', 'train_tokens_per_second': '1977'} +{'loss': '0.3787', 'grad_norm': '1.546', 'learning_rate': '5e-05', 'epoch': '0.1023', 'num_input_tokens_seen': 8319008, 'train_runtime': '4208', 'train_tokens_per_second': '1977'} +{'loss': '2.118', 'grad_norm': '2.78', 'learning_rate': '5e-05', 'epoch': '0.1023', 'num_input_tokens_seen': 8321055, 'train_runtime': '4209', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.214', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 8323102, 'train_runtime': '4210', 'train_tokens_per_second': '1977'} +{'loss': '0.3592', 'grad_norm': '1.381', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 8325149, 'train_runtime': '4211', 'train_tokens_per_second': '1977'} +{'loss': '0.604', 'grad_norm': '1.54', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 8327196, 'train_runtime': '4212', 'train_tokens_per_second': '1977'} +{'loss': '1.804', 'grad_norm': '2.395', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 8329243, 'train_runtime': '4213', 'train_tokens_per_second': '1977'} +{'loss': '1.214', 'grad_norm': '2.042', 'learning_rate': '5e-05', 'epoch': '0.1025', 'num_input_tokens_seen': 8331290, 'train_runtime': '4214', 'train_tokens_per_second': '1977'} +{'loss': '1.516', 'grad_norm': '2.298', 'learning_rate': '5e-05', 'epoch': '0.1025', 'num_input_tokens_seen': 8333337, 'train_runtime': '4215', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '3.211', 'learning_rate': '5e-05', 'epoch': '0.1025', 'num_input_tokens_seen': 8335384, 'train_runtime': '4216', 'train_tokens_per_second': '1977'} +{'loss': '0.4272', 'grad_norm': '1.302', 'learning_rate': '5e-05', 'epoch': '0.1026', 'num_input_tokens_seen': 8337431, 'train_runtime': '4217', 'train_tokens_per_second': '1977'} +{'loss': '0.8785', 'grad_norm': '2.044', 'learning_rate': '5e-05', 'epoch': '0.1026', 'num_input_tokens_seen': 8339478, 'train_runtime': '4218', 'train_tokens_per_second': '1977'} +{'loss': '0.3725', 'grad_norm': '1.306', 'learning_rate': '5e-05', 'epoch': '0.1026', 'num_input_tokens_seen': 8341525, 'train_runtime': '4219', 'train_tokens_per_second': '1977'} +{'loss': '0.8738', 'grad_norm': '1.379', 'learning_rate': '5e-05', 'epoch': '0.1026', 'num_input_tokens_seen': 8343572, 'train_runtime': '4220', 'train_tokens_per_second': '1977'} +{'loss': '0.7119', 'grad_norm': '1.747', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 8345619, 'train_runtime': '4221', 'train_tokens_per_second': '1977'} +{'loss': '1.454', 'grad_norm': '2.302', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 8347666, 'train_runtime': '4222', 'train_tokens_per_second': '1977'} +{'loss': '0.4146', 'grad_norm': '1.34', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 8349713, 'train_runtime': '4223', 'train_tokens_per_second': '1977'} +{'loss': '0.8764', 'grad_norm': '1.782', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 8351760, 'train_runtime': '4224', 'train_tokens_per_second': '1977'} +{'loss': '0.7851', 'grad_norm': '2.071', 'learning_rate': '5e-05', 'epoch': '0.1028', 'num_input_tokens_seen': 8353807, 'train_runtime': '4225', 'train_tokens_per_second': '1977'} +{'loss': '0.5306', 'grad_norm': '1.851', 'learning_rate': '5e-05', 'epoch': '0.1028', 'num_input_tokens_seen': 8355854, 'train_runtime': '4226', 'train_tokens_per_second': '1977'} +{'loss': '0.7672', 'grad_norm': '2.119', 'learning_rate': '5e-05', 'epoch': '0.1028', 'num_input_tokens_seen': 8357901, 'train_runtime': '4227', 'train_tokens_per_second': '1977'} +{'loss': '0.6009', 'grad_norm': '1.516', 'learning_rate': '5e-05', 'epoch': '0.1028', 'num_input_tokens_seen': 8359948, 'train_runtime': '4228', 'train_tokens_per_second': '1977'} +{'loss': '0.4387', 'grad_norm': '1.532', 'learning_rate': '5e-05', 'epoch': '0.1029', 'num_input_tokens_seen': 8361995, 'train_runtime': '4229', 'train_tokens_per_second': '1977'} +{'loss': '1.105', 'grad_norm': '2.058', 'learning_rate': '5e-05', 'epoch': '0.1029', 'num_input_tokens_seen': 8364042, 'train_runtime': '4230', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.729', 'learning_rate': '5e-05', 'epoch': '0.1029', 'num_input_tokens_seen': 8366089, 'train_runtime': '4232', 'train_tokens_per_second': '1977'} +{'loss': '0.6162', 'grad_norm': '1.599', 'learning_rate': '5e-05', 'epoch': '0.1029', 'num_input_tokens_seen': 8368136, 'train_runtime': '4233', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '2.247', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 8370183, 'train_runtime': '4234', 'train_tokens_per_second': '1977'} +{'loss': '1.592', 'grad_norm': '2.537', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 8372230, 'train_runtime': '4235', 'train_tokens_per_second': '1977'} +{'loss': '1.19', 'grad_norm': '1.895', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 8374277, 'train_runtime': '4236', 'train_tokens_per_second': '1977'} +{'loss': '0.4124', 'grad_norm': '1.208', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 8376324, 'train_runtime': '4237', 'train_tokens_per_second': '1977'} +{'loss': '2.379', 'grad_norm': '3.248', 'learning_rate': '5e-05', 'epoch': '0.1031', 'num_input_tokens_seen': 8378371, 'train_runtime': '4238', 'train_tokens_per_second': '1977'} +{'loss': '0.8097', 'grad_norm': '1.907', 'learning_rate': '5e-05', 'epoch': '0.1031', 'num_input_tokens_seen': 8380418, 'train_runtime': '4239', 'train_tokens_per_second': '1977'} +{'loss': '0.4984', 'grad_norm': '1.567', 'learning_rate': '5e-05', 'epoch': '0.1031', 'num_input_tokens_seen': 8382465, 'train_runtime': '4240', 'train_tokens_per_second': '1977'} +{'loss': '0.3691', 'grad_norm': '1.559', 'learning_rate': '5e-05', 'epoch': '0.1031', 'num_input_tokens_seen': 8384512, 'train_runtime': '4241', 'train_tokens_per_second': '1977'} +{'loss': '0.4896', 'grad_norm': '1.388', 'learning_rate': '5e-05', 'epoch': '0.1032', 'num_input_tokens_seen': 8386559, 'train_runtime': '4242', 'train_tokens_per_second': '1977'} +{'loss': '0.4325', 'grad_norm': '1.286', 'learning_rate': '5e-05', 'epoch': '0.1032', 'num_input_tokens_seen': 8388606, 'train_runtime': '4243', 'train_tokens_per_second': '1977'} +{'loss': '0.438', 'grad_norm': '1.239', 'learning_rate': '5e-05', 'epoch': '0.1032', 'num_input_tokens_seen': 8390653, 'train_runtime': '4244', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '2.306', 'learning_rate': '5e-05', 'epoch': '0.1032', 'num_input_tokens_seen': 8392700, 'train_runtime': '4245', 'train_tokens_per_second': '1977'} +{'loss': '0.8418', 'grad_norm': '1.742', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 8394747, 'train_runtime': '4246', 'train_tokens_per_second': '1977'} +{'loss': '0.4103', 'grad_norm': '1.287', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 8396794, 'train_runtime': '4247', 'train_tokens_per_second': '1977'} +{'loss': '0.6156', 'grad_norm': '1.586', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 8398841, 'train_runtime': '4248', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.975', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 8400888, 'train_runtime': '4249', 'train_tokens_per_second': '1977'} +{'loss': '0.6868', 'grad_norm': '1.501', 'learning_rate': '5e-05', 'epoch': '0.1034', 'num_input_tokens_seen': 8402935, 'train_runtime': '4250', 'train_tokens_per_second': '1977'} +{'loss': '1.652', 'grad_norm': '2.754', 'learning_rate': '5e-05', 'epoch': '0.1034', 'num_input_tokens_seen': 8404982, 'train_runtime': '4251', 'train_tokens_per_second': '1977'} +{'loss': '1.218', 'grad_norm': '2.095', 'learning_rate': '5e-05', 'epoch': '0.1034', 'num_input_tokens_seen': 8407029, 'train_runtime': '4252', 'train_tokens_per_second': '1977'} +{'loss': '0.6564', 'grad_norm': '1.735', 'learning_rate': '5e-05', 'epoch': '0.1034', 'num_input_tokens_seen': 8409076, 'train_runtime': '4253', 'train_tokens_per_second': '1977'} +{'loss': '0.4751', 'grad_norm': '1.326', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 8411123, 'train_runtime': '4254', 'train_tokens_per_second': '1977'} +{'loss': '0.5003', 'grad_norm': '1.359', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 8413170, 'train_runtime': '4255', 'train_tokens_per_second': '1977'} +{'loss': '0.3878', 'grad_norm': '1.094', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 8415217, 'train_runtime': '4256', 'train_tokens_per_second': '1977'} +{'loss': '1.398', 'grad_norm': '2.212', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 8417264, 'train_runtime': '4257', 'train_tokens_per_second': '1977'} +{'loss': '0.4286', 'grad_norm': '1.148', 'learning_rate': '5e-05', 'epoch': '0.1036', 'num_input_tokens_seen': 8419311, 'train_runtime': '4258', 'train_tokens_per_second': '1977'} +{'loss': '0.4909', 'grad_norm': '1.434', 'learning_rate': '5e-05', 'epoch': '0.1036', 'num_input_tokens_seen': 8421358, 'train_runtime': '4259', 'train_tokens_per_second': '1977'} +{'loss': '0.7262', 'grad_norm': '1.612', 'learning_rate': '5e-05', 'epoch': '0.1036', 'num_input_tokens_seen': 8423405, 'train_runtime': '4260', 'train_tokens_per_second': '1977'} +{'loss': '0.7923', 'grad_norm': '1.381', 'learning_rate': '5e-05', 'epoch': '0.1036', 'num_input_tokens_seen': 8425452, 'train_runtime': '4261', 'train_tokens_per_second': '1977'} +{'loss': '0.4127', 'grad_norm': '1.11', 'learning_rate': '5e-05', 'epoch': '0.1037', 'num_input_tokens_seen': 8427499, 'train_runtime': '4263', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '2.111', 'learning_rate': '5e-05', 'epoch': '0.1037', 'num_input_tokens_seen': 8429546, 'train_runtime': '4264', 'train_tokens_per_second': '1977'} +{'loss': '0.951', 'grad_norm': '1.88', 'learning_rate': '5e-05', 'epoch': '0.1037', 'num_input_tokens_seen': 8431593, 'train_runtime': '4265', 'train_tokens_per_second': '1977'} +{'loss': '1.575', 'grad_norm': '3.028', 'learning_rate': '5e-05', 'epoch': '0.1037', 'num_input_tokens_seen': 8433640, 'train_runtime': '4266', 'train_tokens_per_second': '1977'} +{'loss': '1.528', 'grad_norm': '2.435', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 8435687, 'train_runtime': '4267', 'train_tokens_per_second': '1977'} +{'loss': '0.7611', 'grad_norm': '1.901', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 8437734, 'train_runtime': '4268', 'train_tokens_per_second': '1977'} +{'loss': '0.6814', 'grad_norm': '1.644', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 8439781, 'train_runtime': '4269', 'train_tokens_per_second': '1977'} +{'loss': '0.3583', 'grad_norm': '1.4', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 8441828, 'train_runtime': '4270', 'train_tokens_per_second': '1977'} +{'loss': '0.4709', 'grad_norm': '1.475', 'learning_rate': '5e-05', 'epoch': '0.1039', 'num_input_tokens_seen': 8443875, 'train_runtime': '4271', 'train_tokens_per_second': '1977'} +{'loss': '0.5488', 'grad_norm': '1.738', 'learning_rate': '5e-05', 'epoch': '0.1039', 'num_input_tokens_seen': 8445922, 'train_runtime': '4272', 'train_tokens_per_second': '1977'} +{'loss': '0.9384', 'grad_norm': '1.937', 'learning_rate': '5e-05', 'epoch': '0.1039', 'num_input_tokens_seen': 8447969, 'train_runtime': '4273', 'train_tokens_per_second': '1977'} +{'loss': '2.26', 'grad_norm': '2.661', 'learning_rate': '5e-05', 'epoch': '0.1039', 'num_input_tokens_seen': 8450016, 'train_runtime': '4274', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '2.093', 'learning_rate': '5e-05', 'epoch': '0.104', 'num_input_tokens_seen': 8452063, 'train_runtime': '4275', 'train_tokens_per_second': '1977'} +{'loss': '0.6282', 'grad_norm': '1.677', 'learning_rate': '5e-05', 'epoch': '0.104', 'num_input_tokens_seen': 8454110, 'train_runtime': '4276', 'train_tokens_per_second': '1977'} +{'loss': '1.186', 'grad_norm': '2.392', 'learning_rate': '5e-05', 'epoch': '0.104', 'num_input_tokens_seen': 8456157, 'train_runtime': '4277', 'train_tokens_per_second': '1977'} +{'loss': '0.3504', 'grad_norm': '1.326', 'learning_rate': '5e-05', 'epoch': '0.104', 'num_input_tokens_seen': 8458204, 'train_runtime': '4278', 'train_tokens_per_second': '1977'} +{'loss': '1.877', 'grad_norm': '2.731', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 8460251, 'train_runtime': '4279', 'train_tokens_per_second': '1977'} +{'loss': '1.321', 'grad_norm': '2.502', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 8462298, 'train_runtime': '4280', 'train_tokens_per_second': '1977'} +{'loss': '0.7371', 'grad_norm': '1.893', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 8464345, 'train_runtime': '4281', 'train_tokens_per_second': '1977'} +{'loss': '0.3966', 'grad_norm': '1.089', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 8466392, 'train_runtime': '4282', 'train_tokens_per_second': '1977'} +{'loss': '0.6938', 'grad_norm': '1.466', 'learning_rate': '5e-05', 'epoch': '0.1042', 'num_input_tokens_seen': 8468439, 'train_runtime': '4283', 'train_tokens_per_second': '1977'} +{'loss': '0.349', 'grad_norm': '1.403', 'learning_rate': '5e-05', 'epoch': '0.1042', 'num_input_tokens_seen': 8470486, 'train_runtime': '4284', 'train_tokens_per_second': '1977'} +{'loss': '0.8358', 'grad_norm': '1.732', 'learning_rate': '5e-05', 'epoch': '0.1042', 'num_input_tokens_seen': 8472533, 'train_runtime': '4285', 'train_tokens_per_second': '1977'} +{'loss': '2.816', 'grad_norm': '2.4', 'learning_rate': '5e-05', 'epoch': '0.1042', 'num_input_tokens_seen': 8474580, 'train_runtime': '4286', 'train_tokens_per_second': '1977'} +{'loss': '1.904', 'grad_norm': '2.608', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 8476627, 'train_runtime': '4287', 'train_tokens_per_second': '1977'} +{'loss': '1.144', 'grad_norm': '2.438', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 8478674, 'train_runtime': '4288', 'train_tokens_per_second': '1977'} +{'loss': '0.9278', 'grad_norm': '2.017', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 8480721, 'train_runtime': '4289', 'train_tokens_per_second': '1977'} +{'loss': '0.9369', 'grad_norm': '1.616', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 8482768, 'train_runtime': '4290', 'train_tokens_per_second': '1977'} +{'loss': '1.185', 'grad_norm': '2.042', 'learning_rate': '5e-05', 'epoch': '0.1044', 'num_input_tokens_seen': 8484815, 'train_runtime': '4291', 'train_tokens_per_second': '1977'} +{'loss': '1.808', 'grad_norm': '2.393', 'learning_rate': '5e-05', 'epoch': '0.1044', 'num_input_tokens_seen': 8486862, 'train_runtime': '4293', 'train_tokens_per_second': '1977'} +{'loss': '0.9226', 'grad_norm': '1.607', 'learning_rate': '5e-05', 'epoch': '0.1044', 'num_input_tokens_seen': 8488909, 'train_runtime': '4294', 'train_tokens_per_second': '1977'} +{'loss': '0.3326', 'grad_norm': '1.452', 'learning_rate': '5e-05', 'epoch': '0.1044', 'num_input_tokens_seen': 8490956, 'train_runtime': '4295', 'train_tokens_per_second': '1977'} +{'loss': '0.4898', 'grad_norm': '1.484', 'learning_rate': '5e-05', 'epoch': '0.1045', 'num_input_tokens_seen': 8493003, 'train_runtime': '4296', 'train_tokens_per_second': '1977'} +{'loss': '0.2683', 'grad_norm': '1.275', 'learning_rate': '5e-05', 'epoch': '0.1045', 'num_input_tokens_seen': 8495050, 'train_runtime': '4297', 'train_tokens_per_second': '1977'} +{'loss': '0.5585', 'grad_norm': '1.578', 'learning_rate': '5e-05', 'epoch': '0.1045', 'num_input_tokens_seen': 8497097, 'train_runtime': '4298', 'train_tokens_per_second': '1977'} +{'loss': '0.4907', 'grad_norm': '1.585', 'learning_rate': '5e-05', 'epoch': '0.1045', 'num_input_tokens_seen': 8499144, 'train_runtime': '4299', 'train_tokens_per_second': '1977'} +{'loss': '1.233', 'grad_norm': '2.11', 'learning_rate': '5e-05', 'epoch': '0.1046', 'num_input_tokens_seen': 8501191, 'train_runtime': '4300', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '2.35', 'learning_rate': '5e-05', 'epoch': '0.1046', 'num_input_tokens_seen': 8503238, 'train_runtime': '4301', 'train_tokens_per_second': '1977'} +{'loss': '0.6323', 'grad_norm': '1.854', 'learning_rate': '5e-05', 'epoch': '0.1046', 'num_input_tokens_seen': 8505285, 'train_runtime': '4302', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '2.192', 'learning_rate': '5e-05', 'epoch': '0.1046', 'num_input_tokens_seen': 8507332, 'train_runtime': '4303', 'train_tokens_per_second': '1977'} +{'loss': '1.048', 'grad_norm': '1.908', 'learning_rate': '5e-05', 'epoch': '0.1047', 'num_input_tokens_seen': 8509379, 'train_runtime': '4304', 'train_tokens_per_second': '1977'} +{'loss': '0.3786', 'grad_norm': '1.509', 'learning_rate': '5e-05', 'epoch': '0.1047', 'num_input_tokens_seen': 8511426, 'train_runtime': '4305', 'train_tokens_per_second': '1977'} +{'loss': '0.4175', 'grad_norm': '1.11', 'learning_rate': '5e-05', 'epoch': '0.1047', 'num_input_tokens_seen': 8513473, 'train_runtime': '4306', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '2.32', 'learning_rate': '5e-05', 'epoch': '0.1047', 'num_input_tokens_seen': 8515520, 'train_runtime': '4307', 'train_tokens_per_second': '1977'} +{'loss': '0.9021', 'grad_norm': '1.568', 'learning_rate': '5e-05', 'epoch': '0.1048', 'num_input_tokens_seen': 8517567, 'train_runtime': '4308', 'train_tokens_per_second': '1977'} +{'loss': '0.9074', 'grad_norm': '2.063', 'learning_rate': '5e-05', 'epoch': '0.1048', 'num_input_tokens_seen': 8519614, 'train_runtime': '4309', 'train_tokens_per_second': '1977'} +{'loss': '2.293', 'grad_norm': '3.139', 'learning_rate': '5e-05', 'epoch': '0.1048', 'num_input_tokens_seen': 8521661, 'train_runtime': '4310', 'train_tokens_per_second': '1977'} +{'loss': '0.3207', 'grad_norm': '1.223', 'learning_rate': '5e-05', 'epoch': '0.1048', 'num_input_tokens_seen': 8523708, 'train_runtime': '4311', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.185', 'learning_rate': '5e-05', 'epoch': '0.1049', 'num_input_tokens_seen': 8525755, 'train_runtime': '4312', 'train_tokens_per_second': '1977'} +{'loss': '1.127', 'grad_norm': '2.218', 'learning_rate': '5e-05', 'epoch': '0.1049', 'num_input_tokens_seen': 8527802, 'train_runtime': '4313', 'train_tokens_per_second': '1977'} +{'loss': '0.7299', 'grad_norm': '1.798', 'learning_rate': '5e-05', 'epoch': '0.1049', 'num_input_tokens_seen': 8529849, 'train_runtime': '4314', 'train_tokens_per_second': '1977'} +{'loss': '0.6723', 'grad_norm': '1.695', 'learning_rate': '5e-05', 'epoch': '0.1049', 'num_input_tokens_seen': 8531896, 'train_runtime': '4315', 'train_tokens_per_second': '1977'} +{'loss': '0.7755', 'grad_norm': '1.626', 'learning_rate': '5e-05', 'epoch': '0.105', 'num_input_tokens_seen': 8533943, 'train_runtime': '4316', 'train_tokens_per_second': '1977'} +{'loss': '0.8849', 'grad_norm': '1.681', 'learning_rate': '5e-05', 'epoch': '0.105', 'num_input_tokens_seen': 8535990, 'train_runtime': '4317', 'train_tokens_per_second': '1977'} +{'loss': '1.134', 'grad_norm': '2.067', 'learning_rate': '5e-05', 'epoch': '0.105', 'num_input_tokens_seen': 8538037, 'train_runtime': '4318', 'train_tokens_per_second': '1977'} +{'loss': '1.387', 'grad_norm': '2.099', 'learning_rate': '5e-05', 'epoch': '0.105', 'num_input_tokens_seen': 8540084, 'train_runtime': '4319', 'train_tokens_per_second': '1977'} +{'loss': '0.7939', 'grad_norm': '1.731', 'learning_rate': '5e-05', 'epoch': '0.1051', 'num_input_tokens_seen': 8542131, 'train_runtime': '4320', 'train_tokens_per_second': '1977'} +{'loss': '0.9096', 'grad_norm': '1.879', 'learning_rate': '5e-05', 'epoch': '0.1051', 'num_input_tokens_seen': 8544178, 'train_runtime': '4321', 'train_tokens_per_second': '1977'} +{'loss': '0.5915', 'grad_norm': '1.826', 'learning_rate': '5e-05', 'epoch': '0.1051', 'num_input_tokens_seen': 8546225, 'train_runtime': '4323', 'train_tokens_per_second': '1977'} +{'loss': '1.174', 'grad_norm': '1.853', 'learning_rate': '5e-05', 'epoch': '0.1051', 'num_input_tokens_seen': 8548272, 'train_runtime': '4324', 'train_tokens_per_second': '1977'} +{'loss': '0.9148', 'grad_norm': '2.015', 'learning_rate': '5e-05', 'epoch': '0.1052', 'num_input_tokens_seen': 8550319, 'train_runtime': '4325', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '2.236', 'learning_rate': '5e-05', 'epoch': '0.1052', 'num_input_tokens_seen': 8552366, 'train_runtime': '4326', 'train_tokens_per_second': '1977'} +{'loss': '0.4168', 'grad_norm': '1.426', 'learning_rate': '5e-05', 'epoch': '0.1052', 'num_input_tokens_seen': 8554413, 'train_runtime': '4327', 'train_tokens_per_second': '1977'} +{'loss': '0.3169', 'grad_norm': '1.14', 'learning_rate': '5e-05', 'epoch': '0.1052', 'num_input_tokens_seen': 8556460, 'train_runtime': '4328', 'train_tokens_per_second': '1977'} +{'loss': '0.8369', 'grad_norm': '1.866', 'learning_rate': '5e-05', 'epoch': '0.1053', 'num_input_tokens_seen': 8558507, 'train_runtime': '4329', 'train_tokens_per_second': '1977'} +{'loss': '0.3914', 'grad_norm': '1.242', 'learning_rate': '5e-05', 'epoch': '0.1053', 'num_input_tokens_seen': 8560554, 'train_runtime': '4330', 'train_tokens_per_second': '1977'} +{'loss': '0.9604', 'grad_norm': '2.187', 'learning_rate': '5e-05', 'epoch': '0.1053', 'num_input_tokens_seen': 8562601, 'train_runtime': '4331', 'train_tokens_per_second': '1977'} +{'loss': '0.4282', 'grad_norm': '1.445', 'learning_rate': '5e-05', 'epoch': '0.1053', 'num_input_tokens_seen': 8564648, 'train_runtime': '4332', 'train_tokens_per_second': '1977'} +{'loss': '1.424', 'grad_norm': '2.688', 'learning_rate': '5e-05', 'epoch': '0.1054', 'num_input_tokens_seen': 8566695, 'train_runtime': '4333', 'train_tokens_per_second': '1977'} +{'loss': '0.5979', 'grad_norm': '1.454', 'learning_rate': '5e-05', 'epoch': '0.1054', 'num_input_tokens_seen': 8568742, 'train_runtime': '4334', 'train_tokens_per_second': '1977'} +{'loss': '2.824', 'grad_norm': '2.751', 'learning_rate': '5e-05', 'epoch': '0.1054', 'num_input_tokens_seen': 8570789, 'train_runtime': '4335', 'train_tokens_per_second': '1977'} +{'loss': '0.7561', 'grad_norm': '1.474', 'learning_rate': '5e-05', 'epoch': '0.1054', 'num_input_tokens_seen': 8572836, 'train_runtime': '4336', 'train_tokens_per_second': '1977'} +{'loss': '1.441', 'grad_norm': '6.831', 'learning_rate': '5e-05', 'epoch': '0.1055', 'num_input_tokens_seen': 8574883, 'train_runtime': '4337', 'train_tokens_per_second': '1977'} +{'loss': '0.4887', 'grad_norm': '1.203', 'learning_rate': '5e-05', 'epoch': '0.1055', 'num_input_tokens_seen': 8576930, 'train_runtime': '4338', 'train_tokens_per_second': '1977'} +{'loss': '0.8375', 'grad_norm': '1.97', 'learning_rate': '5e-05', 'epoch': '0.1055', 'num_input_tokens_seen': 8578977, 'train_runtime': '4339', 'train_tokens_per_second': '1977'} +{'loss': '0.988', 'grad_norm': '1.82', 'learning_rate': '5e-05', 'epoch': '0.1055', 'num_input_tokens_seen': 8581024, 'train_runtime': '4340', 'train_tokens_per_second': '1977'} +{'loss': '1.139', 'grad_norm': '1.966', 'learning_rate': '5e-05', 'epoch': '0.1056', 'num_input_tokens_seen': 8583071, 'train_runtime': '4341', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '2.582', 'learning_rate': '5e-05', 'epoch': '0.1056', 'num_input_tokens_seen': 8585118, 'train_runtime': '4342', 'train_tokens_per_second': '1977'} +{'loss': '0.6242', 'grad_norm': '1.491', 'learning_rate': '5e-05', 'epoch': '0.1056', 'num_input_tokens_seen': 8587165, 'train_runtime': '4343', 'train_tokens_per_second': '1977'} +{'loss': '0.5769', 'grad_norm': '1.547', 'learning_rate': '5e-05', 'epoch': '0.1056', 'num_input_tokens_seen': 8589212, 'train_runtime': '4344', 'train_tokens_per_second': '1977'} +{'loss': '0.8112', 'grad_norm': '1.477', 'learning_rate': '5e-05', 'epoch': '0.1057', 'num_input_tokens_seen': 8591259, 'train_runtime': '4345', 'train_tokens_per_second': '1977'} +{'loss': '0.9498', 'grad_norm': '1.879', 'learning_rate': '5e-05', 'epoch': '0.1057', 'num_input_tokens_seen': 8593306, 'train_runtime': '4346', 'train_tokens_per_second': '1977'} +{'loss': '0.391', 'grad_norm': '1.115', 'learning_rate': '5e-05', 'epoch': '0.1057', 'num_input_tokens_seen': 8595353, 'train_runtime': '4347', 'train_tokens_per_second': '1977'} +{'loss': '1.296', 'grad_norm': '2.03', 'learning_rate': '5e-05', 'epoch': '0.1057', 'num_input_tokens_seen': 8597400, 'train_runtime': '4348', 'train_tokens_per_second': '1977'} +{'loss': '1.206', 'grad_norm': '2.23', 'learning_rate': '5e-05', 'epoch': '0.1058', 'num_input_tokens_seen': 8599447, 'train_runtime': '4349', 'train_tokens_per_second': '1977'} +{'loss': '0.6202', 'grad_norm': '1.499', 'learning_rate': '5e-05', 'epoch': '0.1058', 'num_input_tokens_seen': 8601494, 'train_runtime': '4350', 'train_tokens_per_second': '1977'} +{'loss': '0.816', 'grad_norm': '1.39', 'learning_rate': '5e-05', 'epoch': '0.1058', 'num_input_tokens_seen': 8603541, 'train_runtime': '4351', 'train_tokens_per_second': '1977'} +{'loss': '0.3806', 'grad_norm': '1.143', 'learning_rate': '5e-05', 'epoch': '0.1058', 'num_input_tokens_seen': 8605588, 'train_runtime': '4353', 'train_tokens_per_second': '1977'} +{'loss': '0.9547', 'grad_norm': '1.624', 'learning_rate': '5e-05', 'epoch': '0.1059', 'num_input_tokens_seen': 8607635, 'train_runtime': '4354', 'train_tokens_per_second': '1977'} +{'loss': '0.4143', 'grad_norm': '1.241', 'learning_rate': '5e-05', 'epoch': '0.1059', 'num_input_tokens_seen': 8609682, 'train_runtime': '4355', 'train_tokens_per_second': '1977'} +{'loss': '0.7138', 'grad_norm': '1.505', 'learning_rate': '5e-05', 'epoch': '0.1059', 'num_input_tokens_seen': 8611729, 'train_runtime': '4356', 'train_tokens_per_second': '1977'} +{'loss': '1.277', 'grad_norm': '1.988', 'learning_rate': '5e-05', 'epoch': '0.1059', 'num_input_tokens_seen': 8613776, 'train_runtime': '4357', 'train_tokens_per_second': '1977'} +{'loss': '1.15', 'grad_norm': '2.098', 'learning_rate': '5e-05', 'epoch': '0.106', 'num_input_tokens_seen': 8615823, 'train_runtime': '4358', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.779', 'learning_rate': '5e-05', 'epoch': '0.106', 'num_input_tokens_seen': 8617870, 'train_runtime': '4359', 'train_tokens_per_second': '1977'} +{'loss': '1.102', 'grad_norm': '2.039', 'learning_rate': '5e-05', 'epoch': '0.106', 'num_input_tokens_seen': 8619917, 'train_runtime': '4360', 'train_tokens_per_second': '1977'} +{'loss': '0.6688', 'grad_norm': '1.655', 'learning_rate': '5e-05', 'epoch': '0.1061', 'num_input_tokens_seen': 8621964, 'train_runtime': '4361', 'train_tokens_per_second': '1977'} +{'loss': '0.3974', 'grad_norm': '1.425', 'learning_rate': '5e-05', 'epoch': '0.1061', 'num_input_tokens_seen': 8624011, 'train_runtime': '4362', 'train_tokens_per_second': '1977'} +{'loss': '0.3487', 'grad_norm': '1.276', 'learning_rate': '5e-05', 'epoch': '0.1061', 'num_input_tokens_seen': 8626058, 'train_runtime': '4363', 'train_tokens_per_second': '1977'} +{'loss': '0.7704', 'grad_norm': '1.632', 'learning_rate': '5e-05', 'epoch': '0.1061', 'num_input_tokens_seen': 8628105, 'train_runtime': '4364', 'train_tokens_per_second': '1977'} +{'loss': '0.9102', 'grad_norm': '1.985', 'learning_rate': '5e-05', 'epoch': '0.1062', 'num_input_tokens_seen': 8630152, 'train_runtime': '4365', 'train_tokens_per_second': '1977'} +{'loss': '0.6762', 'grad_norm': '1.914', 'learning_rate': '5e-05', 'epoch': '0.1062', 'num_input_tokens_seen': 8632199, 'train_runtime': '4366', 'train_tokens_per_second': '1977'} +{'loss': '0.3213', 'grad_norm': '1.097', 'learning_rate': '5e-05', 'epoch': '0.1062', 'num_input_tokens_seen': 8634246, 'train_runtime': '4367', 'train_tokens_per_second': '1977'} +{'loss': '0.439', 'grad_norm': '1.274', 'learning_rate': '5e-05', 'epoch': '0.1062', 'num_input_tokens_seen': 8636293, 'train_runtime': '4368', 'train_tokens_per_second': '1977'} +{'loss': '1.468', 'grad_norm': '3.115', 'learning_rate': '5e-05', 'epoch': '0.1063', 'num_input_tokens_seen': 8638340, 'train_runtime': '4369', 'train_tokens_per_second': '1977'} +{'loss': '2.012', 'grad_norm': '2.742', 'learning_rate': '5e-05', 'epoch': '0.1063', 'num_input_tokens_seen': 8640387, 'train_runtime': '4370', 'train_tokens_per_second': '1977'} +{'loss': '1.136', 'grad_norm': '2.085', 'learning_rate': '5e-05', 'epoch': '0.1063', 'num_input_tokens_seen': 8642434, 'train_runtime': '4371', 'train_tokens_per_second': '1977'} +{'loss': '1.68', 'grad_norm': '3.61', 'learning_rate': '5e-05', 'epoch': '0.1063', 'num_input_tokens_seen': 8644481, 'train_runtime': '4372', 'train_tokens_per_second': '1977'} +{'loss': '0.4278', 'grad_norm': '1.59', 'learning_rate': '5e-05', 'epoch': '0.1064', 'num_input_tokens_seen': 8646528, 'train_runtime': '4373', 'train_tokens_per_second': '1977'} +{'loss': '0.3908', 'grad_norm': '1.273', 'learning_rate': '5e-05', 'epoch': '0.1064', 'num_input_tokens_seen': 8648575, 'train_runtime': '4374', 'train_tokens_per_second': '1977'} +{'loss': '0.4894', 'grad_norm': '1.927', 'learning_rate': '5e-05', 'epoch': '0.1064', 'num_input_tokens_seen': 8650622, 'train_runtime': '4375', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '2.258', 'learning_rate': '5e-05', 'epoch': '0.1064', 'num_input_tokens_seen': 8652669, 'train_runtime': '4376', 'train_tokens_per_second': '1977'} +{'loss': '1.494', 'grad_norm': '2.385', 'learning_rate': '5e-05', 'epoch': '0.1065', 'num_input_tokens_seen': 8654716, 'train_runtime': '4377', 'train_tokens_per_second': '1977'} +{'loss': '0.7382', 'grad_norm': '1.753', 'learning_rate': '5e-05', 'epoch': '0.1065', 'num_input_tokens_seen': 8656763, 'train_runtime': '4378', 'train_tokens_per_second': '1977'} +{'loss': '0.3424', 'grad_norm': '1.605', 'learning_rate': '5e-05', 'epoch': '0.1065', 'num_input_tokens_seen': 8658810, 'train_runtime': '4379', 'train_tokens_per_second': '1977'} +{'loss': '0.5557', 'grad_norm': '1.344', 'learning_rate': '5e-05', 'epoch': '0.1065', 'num_input_tokens_seen': 8660857, 'train_runtime': '4380', 'train_tokens_per_second': '1977'} +{'loss': '0.7712', 'grad_norm': '1.612', 'learning_rate': '5e-05', 'epoch': '0.1066', 'num_input_tokens_seen': 8662904, 'train_runtime': '4382', 'train_tokens_per_second': '1977'} +{'loss': '1.793', 'grad_norm': '2.632', 'learning_rate': '5e-05', 'epoch': '0.1066', 'num_input_tokens_seen': 8664951, 'train_runtime': '4383', 'train_tokens_per_second': '1977'} +{'loss': '0.3644', 'grad_norm': '1.885', 'learning_rate': '5e-05', 'epoch': '0.1066', 'num_input_tokens_seen': 8666998, 'train_runtime': '4384', 'train_tokens_per_second': '1977'} +{'loss': '0.8444', 'grad_norm': '1.942', 'learning_rate': '5e-05', 'epoch': '0.1066', 'num_input_tokens_seen': 8669045, 'train_runtime': '4385', 'train_tokens_per_second': '1977'} +{'loss': '0.3452', 'grad_norm': '1.348', 'learning_rate': '5e-05', 'epoch': '0.1067', 'num_input_tokens_seen': 8671092, 'train_runtime': '4386', 'train_tokens_per_second': '1977'} +{'loss': '0.4735', 'grad_norm': '1.366', 'learning_rate': '5e-05', 'epoch': '0.1067', 'num_input_tokens_seen': 8673139, 'train_runtime': '4387', 'train_tokens_per_second': '1977'} +{'loss': '1.774', 'grad_norm': '2.735', 'learning_rate': '5e-05', 'epoch': '0.1067', 'num_input_tokens_seen': 8675186, 'train_runtime': '4388', 'train_tokens_per_second': '1977'} +{'loss': '0.393', 'grad_norm': '1.805', 'learning_rate': '5e-05', 'epoch': '0.1067', 'num_input_tokens_seen': 8677233, 'train_runtime': '4389', 'train_tokens_per_second': '1977'} +{'loss': '0.422', 'grad_norm': '1.35', 'learning_rate': '5e-05', 'epoch': '0.1068', 'num_input_tokens_seen': 8679280, 'train_runtime': '4390', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.132', 'learning_rate': '5e-05', 'epoch': '0.1068', 'num_input_tokens_seen': 8681327, 'train_runtime': '4391', 'train_tokens_per_second': '1977'} +{'loss': '1.12', 'grad_norm': '1.924', 'learning_rate': '5e-05', 'epoch': '0.1068', 'num_input_tokens_seen': 8683374, 'train_runtime': '4392', 'train_tokens_per_second': '1977'} +{'loss': '0.2928', 'grad_norm': '1.362', 'learning_rate': '5e-05', 'epoch': '0.1068', 'num_input_tokens_seen': 8685421, 'train_runtime': '4393', 'train_tokens_per_second': '1977'} +{'loss': '0.6212', 'grad_norm': '1.509', 'learning_rate': '5e-05', 'epoch': '0.1069', 'num_input_tokens_seen': 8687468, 'train_runtime': '4394', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.086', 'learning_rate': '5e-05', 'epoch': '0.1069', 'num_input_tokens_seen': 8689515, 'train_runtime': '4395', 'train_tokens_per_second': '1977'} +{'loss': '0.4429', 'grad_norm': '1.262', 'learning_rate': '5e-05', 'epoch': '0.1069', 'num_input_tokens_seen': 8691562, 'train_runtime': '4396', 'train_tokens_per_second': '1977'} +{'loss': '0.3679', 'grad_norm': '1.097', 'learning_rate': '5e-05', 'epoch': '0.1069', 'num_input_tokens_seen': 8693609, 'train_runtime': '4397', 'train_tokens_per_second': '1977'} +{'loss': '1.646', 'grad_norm': '2.29', 'learning_rate': '5e-05', 'epoch': '0.107', 'num_input_tokens_seen': 8695656, 'train_runtime': '4398', 'train_tokens_per_second': '1977'} +{'loss': '0.4154', 'grad_norm': '1.366', 'learning_rate': '5e-05', 'epoch': '0.107', 'num_input_tokens_seen': 8697703, 'train_runtime': '4399', 'train_tokens_per_second': '1977'} +{'loss': '0.8472', 'grad_norm': '1.632', 'learning_rate': '5e-05', 'epoch': '0.107', 'num_input_tokens_seen': 8699750, 'train_runtime': '4400', 'train_tokens_per_second': '1977'} +{'loss': '0.8456', 'grad_norm': '1.81', 'learning_rate': '5e-05', 'epoch': '0.107', 'num_input_tokens_seen': 8701797, 'train_runtime': '4401', 'train_tokens_per_second': '1977'} +{'loss': '0.3807', 'grad_norm': '1.116', 'learning_rate': '5e-05', 'epoch': '0.1071', 'num_input_tokens_seen': 8703844, 'train_runtime': '4402', 'train_tokens_per_second': '1977'} +{'loss': '0.6067', 'grad_norm': '1.194', 'learning_rate': '5e-05', 'epoch': '0.1071', 'num_input_tokens_seen': 8705891, 'train_runtime': '4403', 'train_tokens_per_second': '1977'} +{'loss': '1.256', 'grad_norm': '3.342', 'learning_rate': '5e-05', 'epoch': '0.1071', 'num_input_tokens_seen': 8707938, 'train_runtime': '4404', 'train_tokens_per_second': '1977'} +{'loss': '0.2737', 'grad_norm': '1.221', 'learning_rate': '5e-05', 'epoch': '0.1071', 'num_input_tokens_seen': 8709985, 'train_runtime': '4405', 'train_tokens_per_second': '1977'} +{'loss': '1.671', 'grad_norm': '2.454', 'learning_rate': '5e-05', 'epoch': '0.1072', 'num_input_tokens_seen': 8712032, 'train_runtime': '4406', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '2.088', 'learning_rate': '5e-05', 'epoch': '0.1072', 'num_input_tokens_seen': 8714079, 'train_runtime': '4407', 'train_tokens_per_second': '1977'} +{'loss': '0.4035', 'grad_norm': '1.31', 'learning_rate': '5e-05', 'epoch': '0.1072', 'num_input_tokens_seen': 8716126, 'train_runtime': '4408', 'train_tokens_per_second': '1977'} +{'loss': '1.155', 'grad_norm': '1.906', 'learning_rate': '5e-05', 'epoch': '0.1072', 'num_input_tokens_seen': 8718173, 'train_runtime': '4409', 'train_tokens_per_second': '1977'} +{'loss': '0.669', 'grad_norm': '1.822', 'learning_rate': '5e-05', 'epoch': '0.1073', 'num_input_tokens_seen': 8720220, 'train_runtime': '4411', 'train_tokens_per_second': '1977'} +{'loss': '0.9396', 'grad_norm': '1.951', 'learning_rate': '5e-05', 'epoch': '0.1073', 'num_input_tokens_seen': 8722267, 'train_runtime': '4412', 'train_tokens_per_second': '1977'} +{'loss': '0.8744', 'grad_norm': '1.737', 'learning_rate': '5e-05', 'epoch': '0.1073', 'num_input_tokens_seen': 8724314, 'train_runtime': '4413', 'train_tokens_per_second': '1977'} +{'loss': '2.1', 'grad_norm': '2.763', 'learning_rate': '5e-05', 'epoch': '0.1073', 'num_input_tokens_seen': 8726361, 'train_runtime': '4414', 'train_tokens_per_second': '1977'} +{'loss': '1.182', 'grad_norm': '1.945', 'learning_rate': '5e-05', 'epoch': '0.1074', 'num_input_tokens_seen': 8728408, 'train_runtime': '4415', 'train_tokens_per_second': '1977'} +{'loss': '0.344', 'grad_norm': '1.1', 'learning_rate': '5e-05', 'epoch': '0.1074', 'num_input_tokens_seen': 8730455, 'train_runtime': '4416', 'train_tokens_per_second': '1977'} +{'loss': '0.3197', 'grad_norm': '1.347', 'learning_rate': '5e-05', 'epoch': '0.1074', 'num_input_tokens_seen': 8732502, 'train_runtime': '4417', 'train_tokens_per_second': '1977'} +{'loss': '0.4947', 'grad_norm': '1.479', 'learning_rate': '5e-05', 'epoch': '0.1074', 'num_input_tokens_seen': 8734549, 'train_runtime': '4418', 'train_tokens_per_second': '1977'} +{'loss': '0.7035', 'grad_norm': '1.976', 'learning_rate': '5e-05', 'epoch': '0.1075', 'num_input_tokens_seen': 8736596, 'train_runtime': '4419', 'train_tokens_per_second': '1977'} +{'loss': '0.8783', 'grad_norm': '2.124', 'learning_rate': '5e-05', 'epoch': '0.1075', 'num_input_tokens_seen': 8738643, 'train_runtime': '4420', 'train_tokens_per_second': '1977'} +{'loss': '0.3551', 'grad_norm': '1.272', 'learning_rate': '5e-05', 'epoch': '0.1075', 'num_input_tokens_seen': 8740690, 'train_runtime': '4421', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '2.223', 'learning_rate': '5e-05', 'epoch': '0.1075', 'num_input_tokens_seen': 8742737, 'train_runtime': '4422', 'train_tokens_per_second': '1977'} +{'loss': '0.4401', 'grad_norm': '1.302', 'learning_rate': '5e-05', 'epoch': '0.1076', 'num_input_tokens_seen': 8744784, 'train_runtime': '4423', 'train_tokens_per_second': '1977'} +{'loss': '0.4765', 'grad_norm': '1.289', 'learning_rate': '5e-05', 'epoch': '0.1076', 'num_input_tokens_seen': 8746831, 'train_runtime': '4424', 'train_tokens_per_second': '1977'} +{'loss': '0.4293', 'grad_norm': '1.514', 'learning_rate': '5e-05', 'epoch': '0.1076', 'num_input_tokens_seen': 8748878, 'train_runtime': '4425', 'train_tokens_per_second': '1977'} +{'loss': '1.048', 'grad_norm': '2.163', 'learning_rate': '5e-05', 'epoch': '0.1076', 'num_input_tokens_seen': 8750925, 'train_runtime': '4426', 'train_tokens_per_second': '1977'} +{'loss': '1.743', 'grad_norm': '2.134', 'learning_rate': '5e-05', 'epoch': '0.1077', 'num_input_tokens_seen': 8752972, 'train_runtime': '4427', 'train_tokens_per_second': '1977'} +{'loss': '0.3727', 'grad_norm': '1.19', 'learning_rate': '5e-05', 'epoch': '0.1077', 'num_input_tokens_seen': 8755019, 'train_runtime': '4428', 'train_tokens_per_second': '1977'} +{'loss': '1.003', 'grad_norm': '1.813', 'learning_rate': '5e-05', 'epoch': '0.1077', 'num_input_tokens_seen': 8757066, 'train_runtime': '4429', 'train_tokens_per_second': '1977'} +{'loss': '0.9053', 'grad_norm': '1.857', 'learning_rate': '5e-05', 'epoch': '0.1077', 'num_input_tokens_seen': 8759113, 'train_runtime': '4430', 'train_tokens_per_second': '1977'} +{'loss': '0.4526', 'grad_norm': '1.524', 'learning_rate': '5e-05', 'epoch': '0.1078', 'num_input_tokens_seen': 8761160, 'train_runtime': '4431', 'train_tokens_per_second': '1977'} +{'loss': '0.9041', 'grad_norm': '1.666', 'learning_rate': '5e-05', 'epoch': '0.1078', 'num_input_tokens_seen': 8763207, 'train_runtime': '4432', 'train_tokens_per_second': '1977'} +{'loss': '1.509', 'grad_norm': '2.322', 'learning_rate': '5e-05', 'epoch': '0.1078', 'num_input_tokens_seen': 8765254, 'train_runtime': '4433', 'train_tokens_per_second': '1977'} +{'loss': '1.026', 'grad_norm': '1.89', 'learning_rate': '5e-05', 'epoch': '0.1078', 'num_input_tokens_seen': 8767301, 'train_runtime': '4434', 'train_tokens_per_second': '1977'} +{'loss': '1.094', 'grad_norm': '2.158', 'learning_rate': '5e-05', 'epoch': '0.1079', 'num_input_tokens_seen': 8769348, 'train_runtime': '4435', 'train_tokens_per_second': '1977'} +{'loss': '0.4933', 'grad_norm': '1.538', 'learning_rate': '5e-05', 'epoch': '0.1079', 'num_input_tokens_seen': 8771395, 'train_runtime': '4436', 'train_tokens_per_second': '1977'} +{'loss': '0.5304', 'grad_norm': '1.485', 'learning_rate': '5e-05', 'epoch': '0.1079', 'num_input_tokens_seen': 8773442, 'train_runtime': '4437', 'train_tokens_per_second': '1977'} +{'loss': '0.4951', 'grad_norm': '1.666', 'learning_rate': '5e-05', 'epoch': '0.1079', 'num_input_tokens_seen': 8775489, 'train_runtime': '4438', 'train_tokens_per_second': '1977'} +{'loss': '0.7941', 'grad_norm': '1.923', 'learning_rate': '5e-05', 'epoch': '0.108', 'num_input_tokens_seen': 8777536, 'train_runtime': '4439', 'train_tokens_per_second': '1977'} +{'loss': '0.4731', 'grad_norm': '1.499', 'learning_rate': '5e-05', 'epoch': '0.108', 'num_input_tokens_seen': 8779583, 'train_runtime': '4440', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.256', 'learning_rate': '5e-05', 'epoch': '0.108', 'num_input_tokens_seen': 8781630, 'train_runtime': '4442', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '1.921', 'learning_rate': '5e-05', 'epoch': '0.108', 'num_input_tokens_seen': 8783677, 'train_runtime': '4443', 'train_tokens_per_second': '1977'} +{'loss': '0.3694', 'grad_norm': '1.274', 'learning_rate': '5e-05', 'epoch': '0.1081', 'num_input_tokens_seen': 8785724, 'train_runtime': '4444', 'train_tokens_per_second': '1977'} +{'loss': '0.4586', 'grad_norm': '1.436', 'learning_rate': '5e-05', 'epoch': '0.1081', 'num_input_tokens_seen': 8787771, 'train_runtime': '4445', 'train_tokens_per_second': '1977'} +{'loss': '0.3824', 'grad_norm': '1.115', 'learning_rate': '5e-05', 'epoch': '0.1081', 'num_input_tokens_seen': 8789818, 'train_runtime': '4446', 'train_tokens_per_second': '1977'} +{'loss': '0.4566', 'grad_norm': '1.463', 'learning_rate': '5e-05', 'epoch': '0.1081', 'num_input_tokens_seen': 8791865, 'train_runtime': '4447', 'train_tokens_per_second': '1977'} +{'loss': '0.8187', 'grad_norm': '1.604', 'learning_rate': '5e-05', 'epoch': '0.1082', 'num_input_tokens_seen': 8793912, 'train_runtime': '4448', 'train_tokens_per_second': '1977'} +{'loss': '0.848', 'grad_norm': '1.935', 'learning_rate': '5e-05', 'epoch': '0.1082', 'num_input_tokens_seen': 8795959, 'train_runtime': '4449', 'train_tokens_per_second': '1977'} +{'loss': '0.3522', 'grad_norm': '1.286', 'learning_rate': '5e-05', 'epoch': '0.1082', 'num_input_tokens_seen': 8798006, 'train_runtime': '4450', 'train_tokens_per_second': '1977'} +{'loss': '0.3648', 'grad_norm': '1.29', 'learning_rate': '5e-05', 'epoch': '0.1082', 'num_input_tokens_seen': 8800053, 'train_runtime': '4451', 'train_tokens_per_second': '1977'} +{'loss': '1.296', 'grad_norm': '2.4', 'learning_rate': '5e-05', 'epoch': '0.1083', 'num_input_tokens_seen': 8802100, 'train_runtime': '4452', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '2.758', 'learning_rate': '5e-05', 'epoch': '0.1083', 'num_input_tokens_seen': 8804147, 'train_runtime': '4453', 'train_tokens_per_second': '1977'} +{'loss': '0.4273', 'grad_norm': '1.564', 'learning_rate': '5e-05', 'epoch': '0.1083', 'num_input_tokens_seen': 8806194, 'train_runtime': '4454', 'train_tokens_per_second': '1977'} +{'loss': '0.7784', 'grad_norm': '1.812', 'learning_rate': '5e-05', 'epoch': '0.1083', 'num_input_tokens_seen': 8808241, 'train_runtime': '4455', 'train_tokens_per_second': '1977'} +{'loss': '0.3572', 'grad_norm': '1.365', 'learning_rate': '5e-05', 'epoch': '0.1084', 'num_input_tokens_seen': 8810288, 'train_runtime': '4456', 'train_tokens_per_second': '1977'} +{'loss': '1.32', 'grad_norm': '2.09', 'learning_rate': '5e-05', 'epoch': '0.1084', 'num_input_tokens_seen': 8812335, 'train_runtime': '4457', 'train_tokens_per_second': '1977'} +{'loss': '0.921', 'grad_norm': '1.639', 'learning_rate': '5e-05', 'epoch': '0.1084', 'num_input_tokens_seen': 8814382, 'train_runtime': '4458', 'train_tokens_per_second': '1977'} +{'loss': '0.9867', 'grad_norm': '1.61', 'learning_rate': '5e-05', 'epoch': '0.1084', 'num_input_tokens_seen': 8816429, 'train_runtime': '4459', 'train_tokens_per_second': '1977'} +{'loss': '0.4691', 'grad_norm': '1.564', 'learning_rate': '5e-05', 'epoch': '0.1085', 'num_input_tokens_seen': 8818476, 'train_runtime': '4460', 'train_tokens_per_second': '1977'} +{'loss': '0.4113', 'grad_norm': '1.247', 'learning_rate': '5e-05', 'epoch': '0.1085', 'num_input_tokens_seen': 8820523, 'train_runtime': '4461', 'train_tokens_per_second': '1977'} +{'loss': '0.8502', 'grad_norm': '1.603', 'learning_rate': '5e-05', 'epoch': '0.1085', 'num_input_tokens_seen': 8822570, 'train_runtime': '4462', 'train_tokens_per_second': '1977'} +{'loss': '0.67', 'grad_norm': '1.484', 'learning_rate': '5e-05', 'epoch': '0.1085', 'num_input_tokens_seen': 8824617, 'train_runtime': '4463', 'train_tokens_per_second': '1977'} +{'loss': '0.361', 'grad_norm': '1.135', 'learning_rate': '5e-05', 'epoch': '0.1086', 'num_input_tokens_seen': 8826664, 'train_runtime': '4464', 'train_tokens_per_second': '1977'} +{'loss': '0.9928', 'grad_norm': '1.912', 'learning_rate': '5e-05', 'epoch': '0.1086', 'num_input_tokens_seen': 8828711, 'train_runtime': '4465', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '2.004', 'learning_rate': '5e-05', 'epoch': '0.1086', 'num_input_tokens_seen': 8830758, 'train_runtime': '4466', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '2.267', 'learning_rate': '5e-05', 'epoch': '0.1086', 'num_input_tokens_seen': 8832805, 'train_runtime': '4467', 'train_tokens_per_second': '1977'} +{'loss': '1.009', 'grad_norm': '1.941', 'learning_rate': '5e-05', 'epoch': '0.1087', 'num_input_tokens_seen': 8834852, 'train_runtime': '4468', 'train_tokens_per_second': '1977'} +{'loss': '0.2782', 'grad_norm': '1.286', 'learning_rate': '5e-05', 'epoch': '0.1087', 'num_input_tokens_seen': 8836899, 'train_runtime': '4469', 'train_tokens_per_second': '1977'} +{'loss': '0.9708', 'grad_norm': '2.156', 'learning_rate': '5e-05', 'epoch': '0.1087', 'num_input_tokens_seen': 8838946, 'train_runtime': '4470', 'train_tokens_per_second': '1977'} +{'loss': '1.159', 'grad_norm': '2.051', 'learning_rate': '5e-05', 'epoch': '0.1087', 'num_input_tokens_seen': 8840993, 'train_runtime': '4472', 'train_tokens_per_second': '1977'} +{'loss': '0.3538', 'grad_norm': '1.343', 'learning_rate': '5e-05', 'epoch': '0.1088', 'num_input_tokens_seen': 8843040, 'train_runtime': '4473', 'train_tokens_per_second': '1977'} +{'loss': '0.4138', 'grad_norm': '1.127', 'learning_rate': '5e-05', 'epoch': '0.1088', 'num_input_tokens_seen': 8845087, 'train_runtime': '4474', 'train_tokens_per_second': '1977'} +{'loss': '0.7944', 'grad_norm': '1.633', 'learning_rate': '5e-05', 'epoch': '0.1088', 'num_input_tokens_seen': 8847134, 'train_runtime': '4475', 'train_tokens_per_second': '1977'} +{'loss': '0.7558', 'grad_norm': '1.591', 'learning_rate': '5e-05', 'epoch': '0.1088', 'num_input_tokens_seen': 8849181, 'train_runtime': '4476', 'train_tokens_per_second': '1977'} +{'loss': '1.252', 'grad_norm': '1.867', 'learning_rate': '5e-05', 'epoch': '0.1089', 'num_input_tokens_seen': 8851228, 'train_runtime': '4477', 'train_tokens_per_second': '1977'} +{'loss': '0.3322', 'grad_norm': '1.112', 'learning_rate': '5e-05', 'epoch': '0.1089', 'num_input_tokens_seen': 8853275, 'train_runtime': '4478', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '1.747', 'learning_rate': '5e-05', 'epoch': '0.1089', 'num_input_tokens_seen': 8855322, 'train_runtime': '4479', 'train_tokens_per_second': '1977'} +{'loss': '0.426', 'grad_norm': '1.246', 'learning_rate': '5e-05', 'epoch': '0.1089', 'num_input_tokens_seen': 8857369, 'train_runtime': '4480', 'train_tokens_per_second': '1977'} +{'loss': '0.9176', 'grad_norm': '1.63', 'learning_rate': '5e-05', 'epoch': '0.109', 'num_input_tokens_seen': 8859416, 'train_runtime': '4481', 'train_tokens_per_second': '1977'} +{'loss': '0.4584', 'grad_norm': '1.519', 'learning_rate': '5e-05', 'epoch': '0.109', 'num_input_tokens_seen': 8861463, 'train_runtime': '4482', 'train_tokens_per_second': '1977'} +{'loss': '0.6866', 'grad_norm': '1.69', 'learning_rate': '5e-05', 'epoch': '0.109', 'num_input_tokens_seen': 8863510, 'train_runtime': '4483', 'train_tokens_per_second': '1977'} +{'loss': '0.4569', 'grad_norm': '1.49', 'learning_rate': '5e-05', 'epoch': '0.109', 'num_input_tokens_seen': 8865557, 'train_runtime': '4484', 'train_tokens_per_second': '1977'} +{'loss': '0.7202', 'grad_norm': '1.649', 'learning_rate': '5e-05', 'epoch': '0.1091', 'num_input_tokens_seen': 8867604, 'train_runtime': '4485', 'train_tokens_per_second': '1977'} +{'loss': '2.409', 'grad_norm': '3.312', 'learning_rate': '5e-05', 'epoch': '0.1091', 'num_input_tokens_seen': 8869651, 'train_runtime': '4486', 'train_tokens_per_second': '1977'} +{'loss': '1.216', 'grad_norm': '2.219', 'learning_rate': '5e-05', 'epoch': '0.1091', 'num_input_tokens_seen': 8871698, 'train_runtime': '4487', 'train_tokens_per_second': '1977'} +{'loss': '0.8272', 'grad_norm': '1.57', 'learning_rate': '5e-05', 'epoch': '0.1091', 'num_input_tokens_seen': 8873745, 'train_runtime': '4488', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '1.959', 'learning_rate': '5e-05', 'epoch': '0.1092', 'num_input_tokens_seen': 8875792, 'train_runtime': '4489', 'train_tokens_per_second': '1977'} +{'loss': '0.5849', 'grad_norm': '1.709', 'learning_rate': '5e-05', 'epoch': '0.1092', 'num_input_tokens_seen': 8877839, 'train_runtime': '4490', 'train_tokens_per_second': '1977'} +{'loss': '0.7549', 'grad_norm': '1.758', 'learning_rate': '5e-05', 'epoch': '0.1092', 'num_input_tokens_seen': 8879886, 'train_runtime': '4491', 'train_tokens_per_second': '1977'} +{'loss': '0.7847', 'grad_norm': '2.115', 'learning_rate': '5e-05', 'epoch': '0.1092', 'num_input_tokens_seen': 8881933, 'train_runtime': '4492', 'train_tokens_per_second': '1977'} +{'loss': '0.5397', 'grad_norm': '1.339', 'learning_rate': '5e-05', 'epoch': '0.1093', 'num_input_tokens_seen': 8883980, 'train_runtime': '4493', 'train_tokens_per_second': '1977'} +{'loss': '0.4108', 'grad_norm': '0.9816', 'learning_rate': '5e-05', 'epoch': '0.1093', 'num_input_tokens_seen': 8886027, 'train_runtime': '4494', 'train_tokens_per_second': '1977'} +{'loss': '1.503', 'grad_norm': '2.305', 'learning_rate': '5e-05', 'epoch': '0.1093', 'num_input_tokens_seen': 8888074, 'train_runtime': '4495', 'train_tokens_per_second': '1977'} +{'loss': '0.3738', 'grad_norm': '1.105', 'learning_rate': '5e-05', 'epoch': '0.1093', 'num_input_tokens_seen': 8890121, 'train_runtime': '4496', 'train_tokens_per_second': '1977'} +{'loss': '0.7665', 'grad_norm': '1.461', 'learning_rate': '5e-05', 'epoch': '0.1094', 'num_input_tokens_seen': 8892168, 'train_runtime': '4497', 'train_tokens_per_second': '1977'} +{'loss': '0.8164', 'grad_norm': '1.675', 'learning_rate': '5e-05', 'epoch': '0.1094', 'num_input_tokens_seen': 8894215, 'train_runtime': '4498', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.189', 'learning_rate': '5e-05', 'epoch': '0.1094', 'num_input_tokens_seen': 8896262, 'train_runtime': '4499', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.695', 'learning_rate': '5e-05', 'epoch': '0.1094', 'num_input_tokens_seen': 8898309, 'train_runtime': '4500', 'train_tokens_per_second': '1977'} +{'loss': '1.081', 'grad_norm': '2.103', 'learning_rate': '5e-05', 'epoch': '0.1095', 'num_input_tokens_seen': 8900356, 'train_runtime': '4501', 'train_tokens_per_second': '1977'} +{'loss': '2.311', 'grad_norm': '2.342', 'learning_rate': '5e-05', 'epoch': '0.1095', 'num_input_tokens_seen': 8902403, 'train_runtime': '4503', 'train_tokens_per_second': '1977'} +{'loss': '1.325', 'grad_norm': '2.509', 'learning_rate': '5e-05', 'epoch': '0.1095', 'num_input_tokens_seen': 8904450, 'train_runtime': '4504', 'train_tokens_per_second': '1977'} +{'loss': '1.158', 'grad_norm': '2.271', 'learning_rate': '5e-05', 'epoch': '0.1096', 'num_input_tokens_seen': 8906497, 'train_runtime': '4505', 'train_tokens_per_second': '1977'} +{'loss': '1.294', 'grad_norm': '2.113', 'learning_rate': '5e-05', 'epoch': '0.1096', 'num_input_tokens_seen': 8908544, 'train_runtime': '4506', 'train_tokens_per_second': '1977'} +{'loss': '1.444', 'grad_norm': '1.935', 'learning_rate': '5e-05', 'epoch': '0.1096', 'num_input_tokens_seen': 8910591, 'train_runtime': '4507', 'train_tokens_per_second': '1977'} +{'loss': '0.8807', 'grad_norm': '1.871', 'learning_rate': '5e-05', 'epoch': '0.1096', 'num_input_tokens_seen': 8912638, 'train_runtime': '4508', 'train_tokens_per_second': '1977'} +{'loss': '0.5862', 'grad_norm': '1.742', 'learning_rate': '5e-05', 'epoch': '0.1097', 'num_input_tokens_seen': 8914685, 'train_runtime': '4509', 'train_tokens_per_second': '1977'} +{'loss': '0.8747', 'grad_norm': '1.478', 'learning_rate': '5e-05', 'epoch': '0.1097', 'num_input_tokens_seen': 8916732, 'train_runtime': '4510', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '1.811', 'learning_rate': '5e-05', 'epoch': '0.1097', 'num_input_tokens_seen': 8918779, 'train_runtime': '4511', 'train_tokens_per_second': '1977'} +{'loss': '0.5285', 'grad_norm': '1.622', 'learning_rate': '5e-05', 'epoch': '0.1097', 'num_input_tokens_seen': 8920826, 'train_runtime': '4512', 'train_tokens_per_second': '1977'} +{'loss': '0.2815', 'grad_norm': '1.201', 'learning_rate': '5e-05', 'epoch': '0.1098', 'num_input_tokens_seen': 8922873, 'train_runtime': '4513', 'train_tokens_per_second': '1977'} +{'loss': '0.8198', 'grad_norm': '1.788', 'learning_rate': '5e-05', 'epoch': '0.1098', 'num_input_tokens_seen': 8924920, 'train_runtime': '4514', 'train_tokens_per_second': '1977'} +{'loss': '0.4135', 'grad_norm': '1.281', 'learning_rate': '5e-05', 'epoch': '0.1098', 'num_input_tokens_seen': 8926967, 'train_runtime': '4515', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.202', 'learning_rate': '5e-05', 'epoch': '0.1098', 'num_input_tokens_seen': 8929014, 'train_runtime': '4516', 'train_tokens_per_second': '1977'} +{'loss': '1.555', 'grad_norm': '2.17', 'learning_rate': '5e-05', 'epoch': '0.1099', 'num_input_tokens_seen': 8931061, 'train_runtime': '4517', 'train_tokens_per_second': '1977'} +{'loss': '0.4644', 'grad_norm': '1.623', 'learning_rate': '5e-05', 'epoch': '0.1099', 'num_input_tokens_seen': 8933108, 'train_runtime': '4518', 'train_tokens_per_second': '1977'} +{'loss': '1.371', 'grad_norm': '2.178', 'learning_rate': '5e-05', 'epoch': '0.1099', 'num_input_tokens_seen': 8935155, 'train_runtime': '4519', 'train_tokens_per_second': '1977'} +{'loss': '0.8595', 'grad_norm': '1.748', 'learning_rate': '5e-05', 'epoch': '0.1099', 'num_input_tokens_seen': 8937202, 'train_runtime': '4520', 'train_tokens_per_second': '1977'} +{'loss': '0.4005', 'grad_norm': '1.085', 'learning_rate': '5e-05', 'epoch': '0.11', 'num_input_tokens_seen': 8939249, 'train_runtime': '4521', 'train_tokens_per_second': '1977'} +{'loss': '1.98', 'grad_norm': '3.078', 'learning_rate': '5e-05', 'epoch': '0.11', 'num_input_tokens_seen': 8941296, 'train_runtime': '4522', 'train_tokens_per_second': '1977'} +{'loss': '0.7879', 'grad_norm': '1.559', 'learning_rate': '5e-05', 'epoch': '0.11', 'num_input_tokens_seen': 8943343, 'train_runtime': '4523', 'train_tokens_per_second': '1977'} +{'loss': '0.4611', 'grad_norm': '1.603', 'learning_rate': '5e-05', 'epoch': '0.11', 'num_input_tokens_seen': 8945390, 'train_runtime': '4524', 'train_tokens_per_second': '1977'} +{'loss': '0.6717', 'grad_norm': '1.622', 'learning_rate': '5e-05', 'epoch': '0.1101', 'num_input_tokens_seen': 8947437, 'train_runtime': '4525', 'train_tokens_per_second': '1977'} +{'loss': '0.6308', 'grad_norm': '1.229', 'learning_rate': '5e-05', 'epoch': '0.1101', 'num_input_tokens_seen': 8949484, 'train_runtime': '4526', 'train_tokens_per_second': '1977'} +{'loss': '0.4319', 'grad_norm': '1.239', 'learning_rate': '5e-05', 'epoch': '0.1101', 'num_input_tokens_seen': 8951531, 'train_runtime': '4527', 'train_tokens_per_second': '1977'} +{'loss': '0.8613', 'grad_norm': '1.903', 'learning_rate': '5e-05', 'epoch': '0.1101', 'num_input_tokens_seen': 8953578, 'train_runtime': '4528', 'train_tokens_per_second': '1977'} +{'loss': '0.9984', 'grad_norm': '1.943', 'learning_rate': '5e-05', 'epoch': '0.1102', 'num_input_tokens_seen': 8955625, 'train_runtime': '4529', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '2.186', 'learning_rate': '5e-05', 'epoch': '0.1102', 'num_input_tokens_seen': 8957672, 'train_runtime': '4531', 'train_tokens_per_second': '1977'} +{'loss': '0.8381', 'grad_norm': '2.04', 'learning_rate': '5e-05', 'epoch': '0.1102', 'num_input_tokens_seen': 8959719, 'train_runtime': '4532', 'train_tokens_per_second': '1977'} +{'loss': '1.835', 'grad_norm': '2.322', 'learning_rate': '5e-05', 'epoch': '0.1102', 'num_input_tokens_seen': 8961766, 'train_runtime': '4533', 'train_tokens_per_second': '1977'} +{'loss': '1.74', 'grad_norm': '2.073', 'learning_rate': '5e-05', 'epoch': '0.1103', 'num_input_tokens_seen': 8963813, 'train_runtime': '4534', 'train_tokens_per_second': '1977'} +{'loss': '0.5957', 'grad_norm': '1.576', 'learning_rate': '5e-05', 'epoch': '0.1103', 'num_input_tokens_seen': 8965860, 'train_runtime': '4535', 'train_tokens_per_second': '1977'} +{'loss': '0.7764', 'grad_norm': '1.964', 'learning_rate': '5e-05', 'epoch': '0.1103', 'num_input_tokens_seen': 8967907, 'train_runtime': '4536', 'train_tokens_per_second': '1977'} +{'loss': '0.3597', 'grad_norm': '1.229', 'learning_rate': '5e-05', 'epoch': '0.1103', 'num_input_tokens_seen': 8969954, 'train_runtime': '4537', 'train_tokens_per_second': '1977'} +{'loss': '0.7117', 'grad_norm': '1.492', 'learning_rate': '5e-05', 'epoch': '0.1104', 'num_input_tokens_seen': 8972001, 'train_runtime': '4538', 'train_tokens_per_second': '1977'} +{'loss': '0.3288', 'grad_norm': '1.177', 'learning_rate': '5e-05', 'epoch': '0.1104', 'num_input_tokens_seen': 8974048, 'train_runtime': '4539', 'train_tokens_per_second': '1977'} +{'loss': '0.8468', 'grad_norm': '1.538', 'learning_rate': '5e-05', 'epoch': '0.1104', 'num_input_tokens_seen': 8976095, 'train_runtime': '4540', 'train_tokens_per_second': '1977'} +{'loss': '0.8409', 'grad_norm': '1.868', 'learning_rate': '5e-05', 'epoch': '0.1104', 'num_input_tokens_seen': 8978142, 'train_runtime': '4541', 'train_tokens_per_second': '1977'} +{'loss': '0.9418', 'grad_norm': '1.94', 'learning_rate': '5e-05', 'epoch': '0.1105', 'num_input_tokens_seen': 8980189, 'train_runtime': '4542', 'train_tokens_per_second': '1977'} +{'loss': '1.134', 'grad_norm': '1.876', 'learning_rate': '5e-05', 'epoch': '0.1105', 'num_input_tokens_seen': 8982236, 'train_runtime': '4543', 'train_tokens_per_second': '1977'} +{'loss': '0.9116', 'grad_norm': '1.666', 'learning_rate': '5e-05', 'epoch': '0.1105', 'num_input_tokens_seen': 8984283, 'train_runtime': '4544', 'train_tokens_per_second': '1977'} +{'loss': '1.447', 'grad_norm': '2.389', 'learning_rate': '5e-05', 'epoch': '0.1105', 'num_input_tokens_seen': 8986330, 'train_runtime': '4545', 'train_tokens_per_second': '1977'} +{'loss': '1.272', 'grad_norm': '1.788', 'learning_rate': '5e-05', 'epoch': '0.1106', 'num_input_tokens_seen': 8988377, 'train_runtime': '4546', 'train_tokens_per_second': '1977'} +{'loss': '0.9151', 'grad_norm': '2.073', 'learning_rate': '5e-05', 'epoch': '0.1106', 'num_input_tokens_seen': 8990424, 'train_runtime': '4547', 'train_tokens_per_second': '1977'} +{'loss': '1.656', 'grad_norm': '2.542', 'learning_rate': '5e-05', 'epoch': '0.1106', 'num_input_tokens_seen': 8992471, 'train_runtime': '4548', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.958', 'learning_rate': '5e-05', 'epoch': '0.1106', 'num_input_tokens_seen': 8994518, 'train_runtime': '4549', 'train_tokens_per_second': '1977'} +{'loss': '2.505', 'grad_norm': '2.907', 'learning_rate': '5e-05', 'epoch': '0.1107', 'num_input_tokens_seen': 8996565, 'train_runtime': '4550', 'train_tokens_per_second': '1977'} +{'loss': '1.238', 'grad_norm': '1.985', 'learning_rate': '5e-05', 'epoch': '0.1107', 'num_input_tokens_seen': 8998612, 'train_runtime': '4551', 'train_tokens_per_second': '1977'} +{'loss': '0.8924', 'grad_norm': '1.706', 'learning_rate': '5e-05', 'epoch': '0.1107', 'num_input_tokens_seen': 9000659, 'train_runtime': '4552', 'train_tokens_per_second': '1977'} +{'loss': '1.112', 'grad_norm': '2.006', 'learning_rate': '5e-05', 'epoch': '0.1107', 'num_input_tokens_seen': 9002706, 'train_runtime': '4553', 'train_tokens_per_second': '1977'} +{'loss': '1.676', 'grad_norm': '2.618', 'learning_rate': '5e-05', 'epoch': '0.1108', 'num_input_tokens_seen': 9004753, 'train_runtime': '4554', 'train_tokens_per_second': '1977'} +{'loss': '2.052', 'grad_norm': '2.562', 'learning_rate': '5e-05', 'epoch': '0.1108', 'num_input_tokens_seen': 9006800, 'train_runtime': '4555', 'train_tokens_per_second': '1977'} +{'loss': '1.805', 'grad_norm': '2.485', 'learning_rate': '5e-05', 'epoch': '0.1108', 'num_input_tokens_seen': 9008847, 'train_runtime': '4556', 'train_tokens_per_second': '1977'} +{'loss': '0.2769', 'grad_norm': '1.048', 'learning_rate': '5e-05', 'epoch': '0.1108', 'num_input_tokens_seen': 9010894, 'train_runtime': '4557', 'train_tokens_per_second': '1977'} +{'loss': '0.3262', 'grad_norm': '1.122', 'learning_rate': '5e-05', 'epoch': '0.1109', 'num_input_tokens_seen': 9012941, 'train_runtime': '4558', 'train_tokens_per_second': '1977'} +{'loss': '1.176', 'grad_norm': '1.922', 'learning_rate': '5e-05', 'epoch': '0.1109', 'num_input_tokens_seen': 9014988, 'train_runtime': '4559', 'train_tokens_per_second': '1977'} +{'loss': '0.7441', 'grad_norm': '1.804', 'learning_rate': '5e-05', 'epoch': '0.1109', 'num_input_tokens_seen': 9017035, 'train_runtime': '4561', 'train_tokens_per_second': '1977'} +{'loss': '0.604', 'grad_norm': '1.371', 'learning_rate': '5e-05', 'epoch': '0.1109', 'num_input_tokens_seen': 9019082, 'train_runtime': '4562', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '1.917', 'learning_rate': '5e-05', 'epoch': '0.111', 'num_input_tokens_seen': 9021129, 'train_runtime': '4563', 'train_tokens_per_second': '1977'} +{'loss': '1.199', 'grad_norm': '2.249', 'learning_rate': '5e-05', 'epoch': '0.111', 'num_input_tokens_seen': 9023176, 'train_runtime': '4564', 'train_tokens_per_second': '1977'} +{'loss': '0.4646', 'grad_norm': '1.339', 'learning_rate': '5e-05', 'epoch': '0.111', 'num_input_tokens_seen': 9025223, 'train_runtime': '4565', 'train_tokens_per_second': '1977'} +{'loss': '0.4565', 'grad_norm': '1.31', 'learning_rate': '5e-05', 'epoch': '0.111', 'num_input_tokens_seen': 9027270, 'train_runtime': '4566', 'train_tokens_per_second': '1977'} +{'loss': '0.7357', 'grad_norm': '1.635', 'learning_rate': '5e-05', 'epoch': '0.1111', 'num_input_tokens_seen': 9029317, 'train_runtime': '4567', 'train_tokens_per_second': '1977'} +{'loss': '0.4377', 'grad_norm': '1.441', 'learning_rate': '5e-05', 'epoch': '0.1111', 'num_input_tokens_seen': 9031364, 'train_runtime': '4568', 'train_tokens_per_second': '1977'} +{'loss': '2.582', 'grad_norm': '2.691', 'learning_rate': '5e-05', 'epoch': '0.1111', 'num_input_tokens_seen': 9033411, 'train_runtime': '4569', 'train_tokens_per_second': '1977'} +{'loss': '0.5379', 'grad_norm': '1.659', 'learning_rate': '5e-05', 'epoch': '0.1111', 'num_input_tokens_seen': 9035458, 'train_runtime': '4570', 'train_tokens_per_second': '1977'} +{'loss': '1.541', 'grad_norm': '2.446', 'learning_rate': '5e-05', 'epoch': '0.1112', 'num_input_tokens_seen': 9037505, 'train_runtime': '4571', 'train_tokens_per_second': '1977'} +{'loss': '0.3843', 'grad_norm': '1.072', 'learning_rate': '5e-05', 'epoch': '0.1112', 'num_input_tokens_seen': 9039552, 'train_runtime': '4572', 'train_tokens_per_second': '1977'} +{'loss': '0.6928', 'grad_norm': '1.687', 'learning_rate': '5e-05', 'epoch': '0.1112', 'num_input_tokens_seen': 9041599, 'train_runtime': '4573', 'train_tokens_per_second': '1977'} +{'loss': '0.6224', 'grad_norm': '1.592', 'learning_rate': '5e-05', 'epoch': '0.1112', 'num_input_tokens_seen': 9043646, 'train_runtime': '4574', 'train_tokens_per_second': '1977'} +{'loss': '0.8942', 'grad_norm': '1.714', 'learning_rate': '5e-05', 'epoch': '0.1113', 'num_input_tokens_seen': 9045693, 'train_runtime': '4575', 'train_tokens_per_second': '1977'} +{'loss': '0.9547', 'grad_norm': '1.9', 'learning_rate': '5e-05', 'epoch': '0.1113', 'num_input_tokens_seen': 9047740, 'train_runtime': '4576', 'train_tokens_per_second': '1977'} +{'loss': '1.788', 'grad_norm': '2.385', 'learning_rate': '5e-05', 'epoch': '0.1113', 'num_input_tokens_seen': 9049787, 'train_runtime': '4577', 'train_tokens_per_second': '1977'} +{'loss': '0.5334', 'grad_norm': '1.578', 'learning_rate': '5e-05', 'epoch': '0.1113', 'num_input_tokens_seen': 9051834, 'train_runtime': '4578', 'train_tokens_per_second': '1977'} +{'loss': '0.8086', 'grad_norm': '1.437', 'learning_rate': '5e-05', 'epoch': '0.1114', 'num_input_tokens_seen': 9053881, 'train_runtime': '4579', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.055', 'learning_rate': '5e-05', 'epoch': '0.1114', 'num_input_tokens_seen': 9055928, 'train_runtime': '4580', 'train_tokens_per_second': '1977'} +{'loss': '0.4784', 'grad_norm': '1.095', 'learning_rate': '5e-05', 'epoch': '0.1114', 'num_input_tokens_seen': 9057975, 'train_runtime': '4581', 'train_tokens_per_second': '1977'} +{'loss': '0.5014', 'grad_norm': '1.212', 'learning_rate': '5e-05', 'epoch': '0.1114', 'num_input_tokens_seen': 9060022, 'train_runtime': '4582', 'train_tokens_per_second': '1977'} +{'loss': '0.9633', 'grad_norm': '1.771', 'learning_rate': '5e-05', 'epoch': '0.1115', 'num_input_tokens_seen': 9062069, 'train_runtime': '4583', 'train_tokens_per_second': '1977'} +{'loss': '0.767', 'grad_norm': '1.738', 'learning_rate': '5e-05', 'epoch': '0.1115', 'num_input_tokens_seen': 9064116, 'train_runtime': '4584', 'train_tokens_per_second': '1977'} +{'loss': '1.584', 'grad_norm': '2.124', 'learning_rate': '5e-05', 'epoch': '0.1115', 'num_input_tokens_seen': 9066163, 'train_runtime': '4585', 'train_tokens_per_second': '1977'} +{'loss': '0.9264', 'grad_norm': '1.833', 'learning_rate': '5e-05', 'epoch': '0.1115', 'num_input_tokens_seen': 9068210, 'train_runtime': '4586', 'train_tokens_per_second': '1977'} +{'loss': '0.3127', 'grad_norm': '1.134', 'learning_rate': '5e-05', 'epoch': '0.1116', 'num_input_tokens_seen': 9070257, 'train_runtime': '4587', 'train_tokens_per_second': '1977'} +{'loss': '0.7408', 'grad_norm': '1.731', 'learning_rate': '5e-05', 'epoch': '0.1116', 'num_input_tokens_seen': 9072304, 'train_runtime': '4588', 'train_tokens_per_second': '1977'} +{'loss': '0.4938', 'grad_norm': '1.352', 'learning_rate': '5e-05', 'epoch': '0.1116', 'num_input_tokens_seen': 9074351, 'train_runtime': '4589', 'train_tokens_per_second': '1977'} +{'loss': '2.283', 'grad_norm': '2.606', 'learning_rate': '5e-05', 'epoch': '0.1116', 'num_input_tokens_seen': 9076398, 'train_runtime': '4591', 'train_tokens_per_second': '1977'} +{'loss': '0.808', 'grad_norm': '1.766', 'learning_rate': '5e-05', 'epoch': '0.1117', 'num_input_tokens_seen': 9078445, 'train_runtime': '4592', 'train_tokens_per_second': '1977'} +{'loss': '0.9725', 'grad_norm': '1.697', 'learning_rate': '5e-05', 'epoch': '0.1117', 'num_input_tokens_seen': 9080492, 'train_runtime': '4593', 'train_tokens_per_second': '1977'} +{'loss': '0.4731', 'grad_norm': '1.308', 'learning_rate': '5e-05', 'epoch': '0.1117', 'num_input_tokens_seen': 9082539, 'train_runtime': '4594', 'train_tokens_per_second': '1977'} +{'loss': '0.8816', 'grad_norm': '1.667', 'learning_rate': '5e-05', 'epoch': '0.1117', 'num_input_tokens_seen': 9084586, 'train_runtime': '4595', 'train_tokens_per_second': '1977'} +{'loss': '0.2858', 'grad_norm': '1.089', 'learning_rate': '5e-05', 'epoch': '0.1118', 'num_input_tokens_seen': 9086633, 'train_runtime': '4596', 'train_tokens_per_second': '1977'} +{'loss': '0.3707', 'grad_norm': '1.12', 'learning_rate': '5e-05', 'epoch': '0.1118', 'num_input_tokens_seen': 9088680, 'train_runtime': '4597', 'train_tokens_per_second': '1977'} +{'loss': '0.6133', 'grad_norm': '1.75', 'learning_rate': '5e-05', 'epoch': '0.1118', 'num_input_tokens_seen': 9090727, 'train_runtime': '4598', 'train_tokens_per_second': '1977'} +{'loss': '1.086', 'grad_norm': '1.843', 'learning_rate': '5e-05', 'epoch': '0.1118', 'num_input_tokens_seen': 9092774, 'train_runtime': '4599', 'train_tokens_per_second': '1977'} +{'loss': '0.5632', 'grad_norm': '1.198', 'learning_rate': '5e-05', 'epoch': '0.1119', 'num_input_tokens_seen': 9094821, 'train_runtime': '4600', 'train_tokens_per_second': '1977'} +{'loss': '0.8611', 'grad_norm': '1.561', 'learning_rate': '5e-05', 'epoch': '0.1119', 'num_input_tokens_seen': 9096868, 'train_runtime': '4601', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '2.053', 'learning_rate': '5e-05', 'epoch': '0.1119', 'num_input_tokens_seen': 9098915, 'train_runtime': '4602', 'train_tokens_per_second': '1977'} +{'loss': '2.478', 'grad_norm': '2.62', 'learning_rate': '5e-05', 'epoch': '0.1119', 'num_input_tokens_seen': 9100962, 'train_runtime': '4603', 'train_tokens_per_second': '1977'} +{'loss': '0.5926', 'grad_norm': '1.687', 'learning_rate': '5e-05', 'epoch': '0.112', 'num_input_tokens_seen': 9103009, 'train_runtime': '4604', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '1.714', 'learning_rate': '5e-05', 'epoch': '0.112', 'num_input_tokens_seen': 9105056, 'train_runtime': '4605', 'train_tokens_per_second': '1977'} +{'loss': '0.8918', 'grad_norm': '1.688', 'learning_rate': '5e-05', 'epoch': '0.112', 'num_input_tokens_seen': 9107103, 'train_runtime': '4606', 'train_tokens_per_second': '1977'} +{'loss': '0.3094', 'grad_norm': '0.9425', 'learning_rate': '5e-05', 'epoch': '0.112', 'num_input_tokens_seen': 9109150, 'train_runtime': '4607', 'train_tokens_per_second': '1977'} +{'loss': '0.7384', 'grad_norm': '1.507', 'learning_rate': '5e-05', 'epoch': '0.1121', 'num_input_tokens_seen': 9111197, 'train_runtime': '4608', 'train_tokens_per_second': '1977'} +{'loss': '0.9086', 'grad_norm': '1.763', 'learning_rate': '5e-05', 'epoch': '0.1121', 'num_input_tokens_seen': 9113244, 'train_runtime': '4609', 'train_tokens_per_second': '1977'} +{'loss': '0.4864', 'grad_norm': '1.506', 'learning_rate': '5e-05', 'epoch': '0.1121', 'num_input_tokens_seen': 9115291, 'train_runtime': '4610', 'train_tokens_per_second': '1977'} +{'loss': '0.4096', 'grad_norm': '1.439', 'learning_rate': '5e-05', 'epoch': '0.1121', 'num_input_tokens_seen': 9117338, 'train_runtime': '4611', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.451', 'learning_rate': '5e-05', 'epoch': '0.1122', 'num_input_tokens_seen': 9119385, 'train_runtime': '4612', 'train_tokens_per_second': '1977'} +{'loss': '0.9038', 'grad_norm': '1.467', 'learning_rate': '5e-05', 'epoch': '0.1122', 'num_input_tokens_seen': 9121432, 'train_runtime': '4613', 'train_tokens_per_second': '1977'} +{'loss': '0.6852', 'grad_norm': '1.794', 'learning_rate': '5e-05', 'epoch': '0.1122', 'num_input_tokens_seen': 9123479, 'train_runtime': '4614', 'train_tokens_per_second': '1977'} +{'loss': '0.3121', 'grad_norm': '1.215', 'learning_rate': '5e-05', 'epoch': '0.1122', 'num_input_tokens_seen': 9125526, 'train_runtime': '4615', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '2.276', 'learning_rate': '5e-05', 'epoch': '0.1123', 'num_input_tokens_seen': 9127573, 'train_runtime': '4616', 'train_tokens_per_second': '1977'} +{'loss': '0.5855', 'grad_norm': '1.985', 'learning_rate': '5e-05', 'epoch': '0.1123', 'num_input_tokens_seen': 9129620, 'train_runtime': '4617', 'train_tokens_per_second': '1977'} +{'loss': '1.532', 'grad_norm': '2.492', 'learning_rate': '5e-05', 'epoch': '0.1123', 'num_input_tokens_seen': 9131667, 'train_runtime': '4618', 'train_tokens_per_second': '1977'} +{'loss': '0.7918', 'grad_norm': '1.302', 'learning_rate': '5e-05', 'epoch': '0.1123', 'num_input_tokens_seen': 9133714, 'train_runtime': '4620', 'train_tokens_per_second': '1977'} +{'loss': '0.8302', 'grad_norm': '2.106', 'learning_rate': '5e-05', 'epoch': '0.1124', 'num_input_tokens_seen': 9135761, 'train_runtime': '4621', 'train_tokens_per_second': '1977'} +{'loss': '0.9943', 'grad_norm': '1.898', 'learning_rate': '5e-05', 'epoch': '0.1124', 'num_input_tokens_seen': 9137808, 'train_runtime': '4622', 'train_tokens_per_second': '1977'} +{'loss': '0.7764', 'grad_norm': '1.693', 'learning_rate': '5e-05', 'epoch': '0.1124', 'num_input_tokens_seen': 9139855, 'train_runtime': '4623', 'train_tokens_per_second': '1977'} +{'loss': '0.4317', 'grad_norm': '1.29', 'learning_rate': '5e-05', 'epoch': '0.1124', 'num_input_tokens_seen': 9141902, 'train_runtime': '4624', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '2.071', 'learning_rate': '5e-05', 'epoch': '0.1125', 'num_input_tokens_seen': 9143949, 'train_runtime': '4625', 'train_tokens_per_second': '1977'} +{'loss': '1.662', 'grad_norm': '2.304', 'learning_rate': '5e-05', 'epoch': '0.1125', 'num_input_tokens_seen': 9145996, 'train_runtime': '4626', 'train_tokens_per_second': '1977'} +{'loss': '0.7078', 'grad_norm': '1.57', 'learning_rate': '5e-05', 'epoch': '0.1125', 'num_input_tokens_seen': 9148043, 'train_runtime': '4627', 'train_tokens_per_second': '1977'} +{'loss': '1.062', 'grad_norm': '1.816', 'learning_rate': '5e-05', 'epoch': '0.1125', 'num_input_tokens_seen': 9150090, 'train_runtime': '4628', 'train_tokens_per_second': '1977'} +{'loss': '0.8357', 'grad_norm': '2.415', 'learning_rate': '5e-05', 'epoch': '0.1126', 'num_input_tokens_seen': 9152137, 'train_runtime': '4629', 'train_tokens_per_second': '1977'} +{'loss': '1.483', 'grad_norm': '2.524', 'learning_rate': '5e-05', 'epoch': '0.1126', 'num_input_tokens_seen': 9154184, 'train_runtime': '4630', 'train_tokens_per_second': '1977'} +{'loss': '0.4264', 'grad_norm': '1.129', 'learning_rate': '5e-05', 'epoch': '0.1126', 'num_input_tokens_seen': 9156231, 'train_runtime': '4631', 'train_tokens_per_second': '1977'} +{'loss': '0.3842', 'grad_norm': '1.39', 'learning_rate': '5e-05', 'epoch': '0.1126', 'num_input_tokens_seen': 9158278, 'train_runtime': '4632', 'train_tokens_per_second': '1977'} +{'loss': '2.166', 'grad_norm': '2.629', 'learning_rate': '5e-05', 'epoch': '0.1127', 'num_input_tokens_seen': 9160325, 'train_runtime': '4633', 'train_tokens_per_second': '1977'} +{'loss': '0.9756', 'grad_norm': '1.892', 'learning_rate': '5e-05', 'epoch': '0.1127', 'num_input_tokens_seen': 9162372, 'train_runtime': '4634', 'train_tokens_per_second': '1977'} +{'loss': '1.508', 'grad_norm': '2.392', 'learning_rate': '5e-05', 'epoch': '0.1127', 'num_input_tokens_seen': 9164419, 'train_runtime': '4635', 'train_tokens_per_second': '1977'} +{'loss': '1.05', 'grad_norm': '1.779', 'learning_rate': '5e-05', 'epoch': '0.1127', 'num_input_tokens_seen': 9166466, 'train_runtime': '4636', 'train_tokens_per_second': '1977'} +{'loss': '0.8209', 'grad_norm': '1.624', 'learning_rate': '5e-05', 'epoch': '0.1128', 'num_input_tokens_seen': 9168513, 'train_runtime': '4637', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.635', 'learning_rate': '5e-05', 'epoch': '0.1128', 'num_input_tokens_seen': 9170560, 'train_runtime': '4638', 'train_tokens_per_second': '1977'} +{'loss': '0.3898', 'grad_norm': '1.231', 'learning_rate': '5e-05', 'epoch': '0.1128', 'num_input_tokens_seen': 9172607, 'train_runtime': '4639', 'train_tokens_per_second': '1977'} +{'loss': '0.3592', 'grad_norm': '1.198', 'learning_rate': '5e-05', 'epoch': '0.1128', 'num_input_tokens_seen': 9174654, 'train_runtime': '4640', 'train_tokens_per_second': '1977'} +{'loss': '0.5021', 'grad_norm': '1.341', 'learning_rate': '5e-05', 'epoch': '0.1129', 'num_input_tokens_seen': 9176701, 'train_runtime': '4641', 'train_tokens_per_second': '1977'} +{'loss': '1.681', 'grad_norm': '2.555', 'learning_rate': '5e-05', 'epoch': '0.1129', 'num_input_tokens_seen': 9178748, 'train_runtime': '4642', 'train_tokens_per_second': '1977'} +{'loss': '0.793', 'grad_norm': '1.386', 'learning_rate': '5e-05', 'epoch': '0.1129', 'num_input_tokens_seen': 9180795, 'train_runtime': '4643', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.76', 'learning_rate': '5e-05', 'epoch': '0.1129', 'num_input_tokens_seen': 9182842, 'train_runtime': '4644', 'train_tokens_per_second': '1977'} +{'loss': '0.5845', 'grad_norm': '1.288', 'learning_rate': '5e-05', 'epoch': '0.113', 'num_input_tokens_seen': 9184889, 'train_runtime': '4645', 'train_tokens_per_second': '1977'} +{'loss': '1.284', 'grad_norm': '2.123', 'learning_rate': '5e-05', 'epoch': '0.113', 'num_input_tokens_seen': 9186936, 'train_runtime': '4646', 'train_tokens_per_second': '1977'} +{'loss': '0.3073', 'grad_norm': '1.042', 'learning_rate': '5e-05', 'epoch': '0.113', 'num_input_tokens_seen': 9188983, 'train_runtime': '4647', 'train_tokens_per_second': '1977'} +{'loss': '1.469', 'grad_norm': '2.419', 'learning_rate': '5e-05', 'epoch': '0.113', 'num_input_tokens_seen': 9191030, 'train_runtime': '4648', 'train_tokens_per_second': '1977'} +{'loss': '1.877', 'grad_norm': '2.637', 'learning_rate': '5e-05', 'epoch': '0.1131', 'num_input_tokens_seen': 9193077, 'train_runtime': '4650', 'train_tokens_per_second': '1977'} +{'loss': '0.3665', 'grad_norm': '1.501', 'learning_rate': '5e-05', 'epoch': '0.1131', 'num_input_tokens_seen': 9195124, 'train_runtime': '4651', 'train_tokens_per_second': '1977'} +{'loss': '0.2883', 'grad_norm': '1.218', 'learning_rate': '5e-05', 'epoch': '0.1131', 'num_input_tokens_seen': 9197171, 'train_runtime': '4652', 'train_tokens_per_second': '1977'} +{'loss': '1.544', 'grad_norm': '2.286', 'learning_rate': '5e-05', 'epoch': '0.1132', 'num_input_tokens_seen': 9199218, 'train_runtime': '4653', 'train_tokens_per_second': '1977'} +{'loss': '0.7596', 'grad_norm': '1.571', 'learning_rate': '5e-05', 'epoch': '0.1132', 'num_input_tokens_seen': 9201265, 'train_runtime': '4654', 'train_tokens_per_second': '1977'} +{'loss': '0.7759', 'grad_norm': '1.635', 'learning_rate': '5e-05', 'epoch': '0.1132', 'num_input_tokens_seen': 9203312, 'train_runtime': '4655', 'train_tokens_per_second': '1977'} +{'loss': '1.913', 'grad_norm': '2.238', 'learning_rate': '5e-05', 'epoch': '0.1132', 'num_input_tokens_seen': 9205359, 'train_runtime': '4656', 'train_tokens_per_second': '1977'} +{'loss': '0.3924', 'grad_norm': '1.524', 'learning_rate': '5e-05', 'epoch': '0.1133', 'num_input_tokens_seen': 9207406, 'train_runtime': '4657', 'train_tokens_per_second': '1977'} +{'loss': '0.9883', 'grad_norm': '2.148', 'learning_rate': '5e-05', 'epoch': '0.1133', 'num_input_tokens_seen': 9209453, 'train_runtime': '4658', 'train_tokens_per_second': '1977'} +{'loss': '0.4996', 'grad_norm': '1.269', 'learning_rate': '5e-05', 'epoch': '0.1133', 'num_input_tokens_seen': 9211500, 'train_runtime': '4659', 'train_tokens_per_second': '1977'} +{'loss': '0.821', 'grad_norm': '1.959', 'learning_rate': '5e-05', 'epoch': '0.1133', 'num_input_tokens_seen': 9213547, 'train_runtime': '4660', 'train_tokens_per_second': '1977'} +{'loss': '0.4219', 'grad_norm': '1.538', 'learning_rate': '5e-05', 'epoch': '0.1134', 'num_input_tokens_seen': 9215594, 'train_runtime': '4661', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '1.814', 'learning_rate': '5e-05', 'epoch': '0.1134', 'num_input_tokens_seen': 9217641, 'train_runtime': '4662', 'train_tokens_per_second': '1977'} +{'loss': '0.3636', 'grad_norm': '1.088', 'learning_rate': '5e-05', 'epoch': '0.1134', 'num_input_tokens_seen': 9219688, 'train_runtime': '4663', 'train_tokens_per_second': '1977'} +{'loss': '0.6401', 'grad_norm': '1.623', 'learning_rate': '5e-05', 'epoch': '0.1134', 'num_input_tokens_seen': 9221735, 'train_runtime': '4664', 'train_tokens_per_second': '1977'} +{'loss': '0.9653', 'grad_norm': '1.726', 'learning_rate': '5e-05', 'epoch': '0.1135', 'num_input_tokens_seen': 9223782, 'train_runtime': '4665', 'train_tokens_per_second': '1977'} +{'loss': '1.337', 'grad_norm': '2.107', 'learning_rate': '5e-05', 'epoch': '0.1135', 'num_input_tokens_seen': 9225829, 'train_runtime': '4666', 'train_tokens_per_second': '1977'} +{'loss': '0.4974', 'grad_norm': '1.328', 'learning_rate': '5e-05', 'epoch': '0.1135', 'num_input_tokens_seen': 9227876, 'train_runtime': '4667', 'train_tokens_per_second': '1977'} +{'loss': '0.8226', 'grad_norm': '1.692', 'learning_rate': '5e-05', 'epoch': '0.1135', 'num_input_tokens_seen': 9229923, 'train_runtime': '4668', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.707', 'learning_rate': '5e-05', 'epoch': '0.1136', 'num_input_tokens_seen': 9231970, 'train_runtime': '4669', 'train_tokens_per_second': '1977'} +{'loss': '0.7327', 'grad_norm': '1.542', 'learning_rate': '5e-05', 'epoch': '0.1136', 'num_input_tokens_seen': 9234017, 'train_runtime': '4670', 'train_tokens_per_second': '1977'} +{'loss': '0.4036', 'grad_norm': '1.278', 'learning_rate': '5e-05', 'epoch': '0.1136', 'num_input_tokens_seen': 9236064, 'train_runtime': '4671', 'train_tokens_per_second': '1977'} +{'loss': '1.266', 'grad_norm': '2.558', 'learning_rate': '5e-05', 'epoch': '0.1136', 'num_input_tokens_seen': 9238111, 'train_runtime': '4672', 'train_tokens_per_second': '1977'} +{'loss': '1.728', 'grad_norm': '2.229', 'learning_rate': '5e-05', 'epoch': '0.1137', 'num_input_tokens_seen': 9240158, 'train_runtime': '4673', 'train_tokens_per_second': '1977'} +{'loss': '0.3624', 'grad_norm': '1.188', 'learning_rate': '5e-05', 'epoch': '0.1137', 'num_input_tokens_seen': 9242205, 'train_runtime': '4674', 'train_tokens_per_second': '1977'} +{'loss': '1.739', 'grad_norm': '2.665', 'learning_rate': '5e-05', 'epoch': '0.1137', 'num_input_tokens_seen': 9244252, 'train_runtime': '4675', 'train_tokens_per_second': '1977'} +{'loss': '2.067', 'grad_norm': '2.297', 'learning_rate': '5e-05', 'epoch': '0.1137', 'num_input_tokens_seen': 9246299, 'train_runtime': '4676', 'train_tokens_per_second': '1977'} +{'loss': '0.8418', 'grad_norm': '1.63', 'learning_rate': '5e-05', 'epoch': '0.1138', 'num_input_tokens_seen': 9248346, 'train_runtime': '4677', 'train_tokens_per_second': '1977'} +{'loss': '1.153', 'grad_norm': '1.819', 'learning_rate': '5e-05', 'epoch': '0.1138', 'num_input_tokens_seen': 9250393, 'train_runtime': '4679', 'train_tokens_per_second': '1977'} +{'loss': '0.6026', 'grad_norm': '1.519', 'learning_rate': '5e-05', 'epoch': '0.1138', 'num_input_tokens_seen': 9252440, 'train_runtime': '4680', 'train_tokens_per_second': '1977'} +{'loss': '0.6241', 'grad_norm': '1.605', 'learning_rate': '5e-05', 'epoch': '0.1138', 'num_input_tokens_seen': 9254487, 'train_runtime': '4681', 'train_tokens_per_second': '1977'} +{'loss': '1.205', 'grad_norm': '2.219', 'learning_rate': '5e-05', 'epoch': '0.1139', 'num_input_tokens_seen': 9256534, 'train_runtime': '4682', 'train_tokens_per_second': '1977'} +{'loss': '0.3174', 'grad_norm': '1.305', 'learning_rate': '5e-05', 'epoch': '0.1139', 'num_input_tokens_seen': 9258581, 'train_runtime': '4683', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '2.029', 'learning_rate': '5e-05', 'epoch': '0.1139', 'num_input_tokens_seen': 9260628, 'train_runtime': '4684', 'train_tokens_per_second': '1977'} +{'loss': '0.4311', 'grad_norm': '1.132', 'learning_rate': '5e-05', 'epoch': '0.1139', 'num_input_tokens_seen': 9262675, 'train_runtime': '4685', 'train_tokens_per_second': '1977'} +{'loss': '1.76', 'grad_norm': '2.282', 'learning_rate': '5e-05', 'epoch': '0.114', 'num_input_tokens_seen': 9264722, 'train_runtime': '4686', 'train_tokens_per_second': '1977'} +{'loss': '0.8297', 'grad_norm': '1.716', 'learning_rate': '5e-05', 'epoch': '0.114', 'num_input_tokens_seen': 9266769, 'train_runtime': '4687', 'train_tokens_per_second': '1977'} +{'loss': '2.45', 'grad_norm': '3.453', 'learning_rate': '5e-05', 'epoch': '0.114', 'num_input_tokens_seen': 9268816, 'train_runtime': '4688', 'train_tokens_per_second': '1977'} +{'loss': '0.4346', 'grad_norm': '1.045', 'learning_rate': '5e-05', 'epoch': '0.114', 'num_input_tokens_seen': 9270863, 'train_runtime': '4689', 'train_tokens_per_second': '1977'} +{'loss': '0.8581', 'grad_norm': '1.871', 'learning_rate': '5e-05', 'epoch': '0.1141', 'num_input_tokens_seen': 9272910, 'train_runtime': '4690', 'train_tokens_per_second': '1977'} +{'loss': '2.535', 'grad_norm': '3.141', 'learning_rate': '5e-05', 'epoch': '0.1141', 'num_input_tokens_seen': 9274957, 'train_runtime': '4691', 'train_tokens_per_second': '1977'} +{'loss': '0.4273', 'grad_norm': '0.9734', 'learning_rate': '5e-05', 'epoch': '0.1141', 'num_input_tokens_seen': 9277004, 'train_runtime': '4692', 'train_tokens_per_second': '1977'} +{'loss': '0.3849', 'grad_norm': '1.214', 'learning_rate': '5e-05', 'epoch': '0.1141', 'num_input_tokens_seen': 9279051, 'train_runtime': '4693', 'train_tokens_per_second': '1977'} +{'loss': '2.287', 'grad_norm': '2.193', 'learning_rate': '5e-05', 'epoch': '0.1142', 'num_input_tokens_seen': 9281098, 'train_runtime': '4694', 'train_tokens_per_second': '1977'} +{'loss': '0.5828', 'grad_norm': '1.227', 'learning_rate': '5e-05', 'epoch': '0.1142', 'num_input_tokens_seen': 9283145, 'train_runtime': '4695', 'train_tokens_per_second': '1977'} +{'loss': '1.563', 'grad_norm': '2.341', 'learning_rate': '5e-05', 'epoch': '0.1142', 'num_input_tokens_seen': 9285192, 'train_runtime': '4696', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '1.911', 'learning_rate': '5e-05', 'epoch': '0.1142', 'num_input_tokens_seen': 9287239, 'train_runtime': '4697', 'train_tokens_per_second': '1977'} +{'loss': '2.515', 'grad_norm': '2.982', 'learning_rate': '5e-05', 'epoch': '0.1143', 'num_input_tokens_seen': 9289286, 'train_runtime': '4698', 'train_tokens_per_second': '1977'} +{'loss': '0.9626', 'grad_norm': '1.822', 'learning_rate': '5e-05', 'epoch': '0.1143', 'num_input_tokens_seen': 9291333, 'train_runtime': '4699', 'train_tokens_per_second': '1977'} +{'loss': '0.4327', 'grad_norm': '1.19', 'learning_rate': '5e-05', 'epoch': '0.1143', 'num_input_tokens_seen': 9293380, 'train_runtime': '4700', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.822', 'learning_rate': '5e-05', 'epoch': '0.1143', 'num_input_tokens_seen': 9295427, 'train_runtime': '4701', 'train_tokens_per_second': '1977'} +{'loss': '1.241', 'grad_norm': '1.715', 'learning_rate': '5e-05', 'epoch': '0.1144', 'num_input_tokens_seen': 9297474, 'train_runtime': '4702', 'train_tokens_per_second': '1977'} +{'loss': '0.4145', 'grad_norm': '1.208', 'learning_rate': '5e-05', 'epoch': '0.1144', 'num_input_tokens_seen': 9299521, 'train_runtime': '4703', 'train_tokens_per_second': '1977'} +{'loss': '2.326', 'grad_norm': '2.657', 'learning_rate': '5e-05', 'epoch': '0.1144', 'num_input_tokens_seen': 9301568, 'train_runtime': '4704', 'train_tokens_per_second': '1977'} +{'loss': '1.371', 'grad_norm': '2.191', 'learning_rate': '5e-05', 'epoch': '0.1144', 'num_input_tokens_seen': 9303615, 'train_runtime': '4705', 'train_tokens_per_second': '1977'} +{'loss': '0.3409', 'grad_norm': '1.337', 'learning_rate': '5e-05', 'epoch': '0.1145', 'num_input_tokens_seen': 9305662, 'train_runtime': '4706', 'train_tokens_per_second': '1977'} +{'loss': '1.268', 'grad_norm': '2.182', 'learning_rate': '5e-05', 'epoch': '0.1145', 'num_input_tokens_seen': 9307709, 'train_runtime': '4707', 'train_tokens_per_second': '1977'} +{'loss': '1.103', 'grad_norm': '1.747', 'learning_rate': '5e-05', 'epoch': '0.1145', 'num_input_tokens_seen': 9309756, 'train_runtime': '4709', 'train_tokens_per_second': '1977'} +{'loss': '0.9122', 'grad_norm': '2.219', 'learning_rate': '5e-05', 'epoch': '0.1145', 'num_input_tokens_seen': 9311803, 'train_runtime': '4710', 'train_tokens_per_second': '1977'} +{'loss': '0.3436', 'grad_norm': '1.171', 'learning_rate': '5e-05', 'epoch': '0.1146', 'num_input_tokens_seen': 9313850, 'train_runtime': '4711', 'train_tokens_per_second': '1977'} +{'loss': '0.3839', 'grad_norm': '1.018', 'learning_rate': '5e-05', 'epoch': '0.1146', 'num_input_tokens_seen': 9315897, 'train_runtime': '4712', 'train_tokens_per_second': '1977'} +{'loss': '1.211', 'grad_norm': '2.031', 'learning_rate': '5e-05', 'epoch': '0.1146', 'num_input_tokens_seen': 9317944, 'train_runtime': '4713', 'train_tokens_per_second': '1977'} +{'loss': '0.6288', 'grad_norm': '1.401', 'learning_rate': '5e-05', 'epoch': '0.1146', 'num_input_tokens_seen': 9319991, 'train_runtime': '4714', 'train_tokens_per_second': '1977'} +{'loss': '0.8551', 'grad_norm': '1.595', 'learning_rate': '5e-05', 'epoch': '0.1147', 'num_input_tokens_seen': 9322038, 'train_runtime': '4715', 'train_tokens_per_second': '1977'} +{'loss': '1.696', 'grad_norm': '2.536', 'learning_rate': '5e-05', 'epoch': '0.1147', 'num_input_tokens_seen': 9324085, 'train_runtime': '4716', 'train_tokens_per_second': '1977'} +{'loss': '1.741', 'grad_norm': '2.187', 'learning_rate': '5e-05', 'epoch': '0.1147', 'num_input_tokens_seen': 9326132, 'train_runtime': '4717', 'train_tokens_per_second': '1977'} +{'loss': '0.6402', 'grad_norm': '1.79', 'learning_rate': '5e-05', 'epoch': '0.1147', 'num_input_tokens_seen': 9328179, 'train_runtime': '4718', 'train_tokens_per_second': '1977'} +{'loss': '0.5649', 'grad_norm': '1.536', 'learning_rate': '5e-05', 'epoch': '0.1148', 'num_input_tokens_seen': 9330226, 'train_runtime': '4719', 'train_tokens_per_second': '1977'} +{'loss': '0.6578', 'grad_norm': '1.545', 'learning_rate': '5e-05', 'epoch': '0.1148', 'num_input_tokens_seen': 9332273, 'train_runtime': '4720', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '2.615', 'learning_rate': '5e-05', 'epoch': '0.1148', 'num_input_tokens_seen': 9334320, 'train_runtime': '4721', 'train_tokens_per_second': '1977'} +{'loss': '0.6198', 'grad_norm': '1.466', 'learning_rate': '5e-05', 'epoch': '0.1148', 'num_input_tokens_seen': 9336367, 'train_runtime': '4722', 'train_tokens_per_second': '1977'} +{'loss': '0.4211', 'grad_norm': '1.246', 'learning_rate': '5e-05', 'epoch': '0.1149', 'num_input_tokens_seen': 9338414, 'train_runtime': '4723', 'train_tokens_per_second': '1977'} +{'loss': '1.45', 'grad_norm': '2.427', 'learning_rate': '5e-05', 'epoch': '0.1149', 'num_input_tokens_seen': 9340461, 'train_runtime': '4724', 'train_tokens_per_second': '1977'} +{'loss': '0.578', 'grad_norm': '1.546', 'learning_rate': '5e-05', 'epoch': '0.1149', 'num_input_tokens_seen': 9342508, 'train_runtime': '4725', 'train_tokens_per_second': '1977'} +{'loss': '1.472', 'grad_norm': '2.091', 'learning_rate': '5e-05', 'epoch': '0.1149', 'num_input_tokens_seen': 9344555, 'train_runtime': '4726', 'train_tokens_per_second': '1977'} +{'loss': '1.536', 'grad_norm': '2.256', 'learning_rate': '5e-05', 'epoch': '0.115', 'num_input_tokens_seen': 9346602, 'train_runtime': '4727', 'train_tokens_per_second': '1977'} +{'loss': '0.9012', 'grad_norm': '1.238', 'learning_rate': '5e-05', 'epoch': '0.115', 'num_input_tokens_seen': 9348649, 'train_runtime': '4728', 'train_tokens_per_second': '1977'} +{'loss': '0.5083', 'grad_norm': '1.384', 'learning_rate': '5e-05', 'epoch': '0.115', 'num_input_tokens_seen': 9350696, 'train_runtime': '4729', 'train_tokens_per_second': '1977'} +{'loss': '0.4064', 'grad_norm': '0.8639', 'learning_rate': '5e-05', 'epoch': '0.115', 'num_input_tokens_seen': 9352743, 'train_runtime': '4730', 'train_tokens_per_second': '1977'} +{'loss': '1.817', 'grad_norm': '2.244', 'learning_rate': '5e-05', 'epoch': '0.1151', 'num_input_tokens_seen': 9354790, 'train_runtime': '4731', 'train_tokens_per_second': '1977'} +{'loss': '0.3492', 'grad_norm': '1.223', 'learning_rate': '5e-05', 'epoch': '0.1151', 'num_input_tokens_seen': 9356837, 'train_runtime': '4732', 'train_tokens_per_second': '1977'} +{'loss': '1.233', 'grad_norm': '2.553', 'learning_rate': '5e-05', 'epoch': '0.1151', 'num_input_tokens_seen': 9358884, 'train_runtime': '4733', 'train_tokens_per_second': '1977'} +{'loss': '0.5126', 'grad_norm': '1.336', 'learning_rate': '5e-05', 'epoch': '0.1151', 'num_input_tokens_seen': 9360931, 'train_runtime': '4734', 'train_tokens_per_second': '1977'} +{'loss': '0.4114', 'grad_norm': '1.162', 'learning_rate': '5e-05', 'epoch': '0.1152', 'num_input_tokens_seen': 9362978, 'train_runtime': '4735', 'train_tokens_per_second': '1977'} +{'loss': '0.8162', 'grad_norm': '1.728', 'learning_rate': '5e-05', 'epoch': '0.1152', 'num_input_tokens_seen': 9365025, 'train_runtime': '4737', 'train_tokens_per_second': '1977'} +{'loss': '0.7289', 'grad_norm': '1.228', 'learning_rate': '5e-05', 'epoch': '0.1152', 'num_input_tokens_seen': 9367072, 'train_runtime': '4738', 'train_tokens_per_second': '1977'} +{'loss': '1.104', 'grad_norm': '2.175', 'learning_rate': '5e-05', 'epoch': '0.1152', 'num_input_tokens_seen': 9369119, 'train_runtime': '4739', 'train_tokens_per_second': '1977'} +{'loss': '1.554', 'grad_norm': '2.257', 'learning_rate': '5e-05', 'epoch': '0.1153', 'num_input_tokens_seen': 9371166, 'train_runtime': '4740', 'train_tokens_per_second': '1977'} +{'loss': '0.7016', 'grad_norm': '1.798', 'learning_rate': '5e-05', 'epoch': '0.1153', 'num_input_tokens_seen': 9373213, 'train_runtime': '4741', 'train_tokens_per_second': '1977'} +{'loss': '0.58', 'grad_norm': '1.032', 'learning_rate': '5e-05', 'epoch': '0.1153', 'num_input_tokens_seen': 9375260, 'train_runtime': '4742', 'train_tokens_per_second': '1977'} +{'loss': '2.07', 'grad_norm': '2.347', 'learning_rate': '5e-05', 'epoch': '0.1153', 'num_input_tokens_seen': 9377307, 'train_runtime': '4743', 'train_tokens_per_second': '1977'} +{'loss': '0.2916', 'grad_norm': '1.122', 'learning_rate': '5e-05', 'epoch': '0.1154', 'num_input_tokens_seen': 9379354, 'train_runtime': '4744', 'train_tokens_per_second': '1977'} +{'loss': '0.4869', 'grad_norm': '1.359', 'learning_rate': '5e-05', 'epoch': '0.1154', 'num_input_tokens_seen': 9381401, 'train_runtime': '4745', 'train_tokens_per_second': '1977'} +{'loss': '0.4647', 'grad_norm': '1.467', 'learning_rate': '5e-05', 'epoch': '0.1154', 'num_input_tokens_seen': 9383448, 'train_runtime': '4746', 'train_tokens_per_second': '1977'} +{'loss': '0.5567', 'grad_norm': '1.539', 'learning_rate': '5e-05', 'epoch': '0.1154', 'num_input_tokens_seen': 9385495, 'train_runtime': '4747', 'train_tokens_per_second': '1977'} +{'loss': '0.497', 'grad_norm': '1.211', 'learning_rate': '5e-05', 'epoch': '0.1155', 'num_input_tokens_seen': 9387542, 'train_runtime': '4748', 'train_tokens_per_second': '1977'} +{'loss': '1.414', 'grad_norm': '2.001', 'learning_rate': '5e-05', 'epoch': '0.1155', 'num_input_tokens_seen': 9389589, 'train_runtime': '4749', 'train_tokens_per_second': '1977'} +{'loss': '1.506', 'grad_norm': '2.683', 'learning_rate': '5e-05', 'epoch': '0.1155', 'num_input_tokens_seen': 9391636, 'train_runtime': '4750', 'train_tokens_per_second': '1977'} +{'loss': '0.8164', 'grad_norm': '1.382', 'learning_rate': '5e-05', 'epoch': '0.1155', 'num_input_tokens_seen': 9393683, 'train_runtime': '4751', 'train_tokens_per_second': '1977'} +{'loss': '0.2545', 'grad_norm': '1.08', 'learning_rate': '5e-05', 'epoch': '0.1156', 'num_input_tokens_seen': 9395730, 'train_runtime': '4752', 'train_tokens_per_second': '1977'} +{'loss': '0.8257', 'grad_norm': '1.951', 'learning_rate': '5e-05', 'epoch': '0.1156', 'num_input_tokens_seen': 9397777, 'train_runtime': '4753', 'train_tokens_per_second': '1977'} +{'loss': '0.7153', 'grad_norm': '2.061', 'learning_rate': '5e-05', 'epoch': '0.1156', 'num_input_tokens_seen': 9399824, 'train_runtime': '4754', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '1.838', 'learning_rate': '5e-05', 'epoch': '0.1156', 'num_input_tokens_seen': 9401871, 'train_runtime': '4755', 'train_tokens_per_second': '1977'} +{'loss': '1.316', 'grad_norm': '2.138', 'learning_rate': '5e-05', 'epoch': '0.1157', 'num_input_tokens_seen': 9403918, 'train_runtime': '4756', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '2.083', 'learning_rate': '5e-05', 'epoch': '0.1157', 'num_input_tokens_seen': 9405965, 'train_runtime': '4757', 'train_tokens_per_second': '1977'} +{'loss': '0.4295', 'grad_norm': '1.655', 'learning_rate': '5e-05', 'epoch': '0.1157', 'num_input_tokens_seen': 9408012, 'train_runtime': '4758', 'train_tokens_per_second': '1977'} +{'loss': '0.6059', 'grad_norm': '1.341', 'learning_rate': '5e-05', 'epoch': '0.1157', 'num_input_tokens_seen': 9410059, 'train_runtime': '4759', 'train_tokens_per_second': '1977'} +{'loss': '0.6698', 'grad_norm': '1.247', 'learning_rate': '5e-05', 'epoch': '0.1158', 'num_input_tokens_seen': 9412106, 'train_runtime': '4760', 'train_tokens_per_second': '1977'} +{'loss': '1.122', 'grad_norm': '1.955', 'learning_rate': '5e-05', 'epoch': '0.1158', 'num_input_tokens_seen': 9414153, 'train_runtime': '4761', 'train_tokens_per_second': '1977'} +{'loss': '0.3979', 'grad_norm': '1.628', 'learning_rate': '5e-05', 'epoch': '0.1158', 'num_input_tokens_seen': 9416200, 'train_runtime': '4762', 'train_tokens_per_second': '1977'} +{'loss': '0.4072', 'grad_norm': '1.44', 'learning_rate': '5e-05', 'epoch': '0.1158', 'num_input_tokens_seen': 9418247, 'train_runtime': '4763', 'train_tokens_per_second': '1977'} +{'loss': '1.25', 'grad_norm': '2.103', 'learning_rate': '5e-05', 'epoch': '0.1159', 'num_input_tokens_seen': 9420294, 'train_runtime': '4764', 'train_tokens_per_second': '1977'} +{'loss': '1.756', 'grad_norm': '2.556', 'learning_rate': '5e-05', 'epoch': '0.1159', 'num_input_tokens_seen': 9422341, 'train_runtime': '4766', 'train_tokens_per_second': '1977'} +{'loss': '0.6825', 'grad_norm': '1.607', 'learning_rate': '5e-05', 'epoch': '0.1159', 'num_input_tokens_seen': 9424388, 'train_runtime': '4767', 'train_tokens_per_second': '1977'} +{'loss': '0.447', 'grad_norm': '1.266', 'learning_rate': '5e-05', 'epoch': '0.1159', 'num_input_tokens_seen': 9426435, 'train_runtime': '4768', 'train_tokens_per_second': '1977'} +{'loss': '1.468', 'grad_norm': '2.192', 'learning_rate': '5e-05', 'epoch': '0.116', 'num_input_tokens_seen': 9428482, 'train_runtime': '4769', 'train_tokens_per_second': '1977'} +{'loss': '0.343', 'grad_norm': '1.125', 'learning_rate': '5e-05', 'epoch': '0.116', 'num_input_tokens_seen': 9430529, 'train_runtime': '4770', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.111', 'learning_rate': '5e-05', 'epoch': '0.116', 'num_input_tokens_seen': 9432576, 'train_runtime': '4771', 'train_tokens_per_second': '1977'} +{'loss': '1.132', 'grad_norm': '1.894', 'learning_rate': '5e-05', 'epoch': '0.116', 'num_input_tokens_seen': 9434623, 'train_runtime': '4772', 'train_tokens_per_second': '1977'} +{'loss': '0.6334', 'grad_norm': '1.603', 'learning_rate': '5e-05', 'epoch': '0.1161', 'num_input_tokens_seen': 9436670, 'train_runtime': '4773', 'train_tokens_per_second': '1977'} +{'loss': '2.405', 'grad_norm': '2.779', 'learning_rate': '5e-05', 'epoch': '0.1161', 'num_input_tokens_seen': 9438717, 'train_runtime': '4774', 'train_tokens_per_second': '1977'} +{'loss': '0.7055', 'grad_norm': '2.002', 'learning_rate': '5e-05', 'epoch': '0.1161', 'num_input_tokens_seen': 9440764, 'train_runtime': '4775', 'train_tokens_per_second': '1977'} +{'loss': '0.5164', 'grad_norm': '1.257', 'learning_rate': '5e-05', 'epoch': '0.1161', 'num_input_tokens_seen': 9442811, 'train_runtime': '4776', 'train_tokens_per_second': '1977'} +{'loss': '0.8895', 'grad_norm': '1.937', 'learning_rate': '5e-05', 'epoch': '0.1162', 'num_input_tokens_seen': 9444858, 'train_runtime': '4777', 'train_tokens_per_second': '1977'} +{'loss': '0.5287', 'grad_norm': '1.205', 'learning_rate': '5e-05', 'epoch': '0.1162', 'num_input_tokens_seen': 9446905, 'train_runtime': '4778', 'train_tokens_per_second': '1977'} +{'loss': '1.683', 'grad_norm': '1.931', 'learning_rate': '5e-05', 'epoch': '0.1162', 'num_input_tokens_seen': 9448952, 'train_runtime': '4779', 'train_tokens_per_second': '1977'} +{'loss': '1.766', 'grad_norm': '2.017', 'learning_rate': '5e-05', 'epoch': '0.1162', 'num_input_tokens_seen': 9450999, 'train_runtime': '4780', 'train_tokens_per_second': '1977'} +{'loss': '0.8436', 'grad_norm': '1.643', 'learning_rate': '5e-05', 'epoch': '0.1163', 'num_input_tokens_seen': 9453046, 'train_runtime': '4781', 'train_tokens_per_second': '1977'} +{'loss': '1.233', 'grad_norm': '1.881', 'learning_rate': '5e-05', 'epoch': '0.1163', 'num_input_tokens_seen': 9455093, 'train_runtime': '4782', 'train_tokens_per_second': '1977'} +{'loss': '1.664', 'grad_norm': '2.065', 'learning_rate': '5e-05', 'epoch': '0.1163', 'num_input_tokens_seen': 9457140, 'train_runtime': '4783', 'train_tokens_per_second': '1977'} +{'loss': '0.6764', 'grad_norm': '1.627', 'learning_rate': '5e-05', 'epoch': '0.1163', 'num_input_tokens_seen': 9459187, 'train_runtime': '4784', 'train_tokens_per_second': '1977'} +{'loss': '0.6238', 'grad_norm': '1.641', 'learning_rate': '5e-05', 'epoch': '0.1164', 'num_input_tokens_seen': 9461234, 'train_runtime': '4785', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '1.287', 'learning_rate': '5e-05', 'epoch': '0.1164', 'num_input_tokens_seen': 9463281, 'train_runtime': '4786', 'train_tokens_per_second': '1977'} +{'loss': '0.6929', 'grad_norm': '1.697', 'learning_rate': '5e-05', 'epoch': '0.1164', 'num_input_tokens_seen': 9465328, 'train_runtime': '4787', 'train_tokens_per_second': '1977'} +{'loss': '0.5775', 'grad_norm': '1.481', 'learning_rate': '5e-05', 'epoch': '0.1164', 'num_input_tokens_seen': 9467375, 'train_runtime': '4788', 'train_tokens_per_second': '1977'} +{'loss': '1.982', 'grad_norm': '3.162', 'learning_rate': '5e-05', 'epoch': '0.1165', 'num_input_tokens_seen': 9469422, 'train_runtime': '4789', 'train_tokens_per_second': '1977'} +{'loss': '0.4419', 'grad_norm': '1.322', 'learning_rate': '5e-05', 'epoch': '0.1165', 'num_input_tokens_seen': 9471469, 'train_runtime': '4790', 'train_tokens_per_second': '1977'} +{'loss': '0.6476', 'grad_norm': '1.512', 'learning_rate': '5e-05', 'epoch': '0.1165', 'num_input_tokens_seen': 9473516, 'train_runtime': '4791', 'train_tokens_per_second': '1977'} +{'loss': '0.4782', 'grad_norm': '1.005', 'learning_rate': '5e-05', 'epoch': '0.1165', 'num_input_tokens_seen': 9475563, 'train_runtime': '4792', 'train_tokens_per_second': '1977'} +{'loss': '0.3137', 'grad_norm': '1.46', 'learning_rate': '5e-05', 'epoch': '0.1166', 'num_input_tokens_seen': 9477610, 'train_runtime': '4793', 'train_tokens_per_second': '1977'} +{'loss': '0.9326', 'grad_norm': '1.523', 'learning_rate': '5e-05', 'epoch': '0.1166', 'num_input_tokens_seen': 9479657, 'train_runtime': '4794', 'train_tokens_per_second': '1977'} +{'loss': '0.6816', 'grad_norm': '1.705', 'learning_rate': '5e-05', 'epoch': '0.1166', 'num_input_tokens_seen': 9481704, 'train_runtime': '4796', 'train_tokens_per_second': '1977'} +{'loss': '0.6681', 'grad_norm': '1.598', 'learning_rate': '5e-05', 'epoch': '0.1167', 'num_input_tokens_seen': 9483751, 'train_runtime': '4797', 'train_tokens_per_second': '1977'} +{'loss': '0.6216', 'grad_norm': '1.662', 'learning_rate': '5e-05', 'epoch': '0.1167', 'num_input_tokens_seen': 9485798, 'train_runtime': '4798', 'train_tokens_per_second': '1977'} +{'loss': '0.5391', 'grad_norm': '1.322', 'learning_rate': '5e-05', 'epoch': '0.1167', 'num_input_tokens_seen': 9487845, 'train_runtime': '4799', 'train_tokens_per_second': '1977'} +{'loss': '0.7309', 'grad_norm': '2.094', 'learning_rate': '5e-05', 'epoch': '0.1167', 'num_input_tokens_seen': 9489892, 'train_runtime': '4800', 'train_tokens_per_second': '1977'} +{'loss': '0.6264', 'grad_norm': '1.481', 'learning_rate': '5e-05', 'epoch': '0.1168', 'num_input_tokens_seen': 9491939, 'train_runtime': '4801', 'train_tokens_per_second': '1977'} +{'loss': '0.7477', 'grad_norm': '1.68', 'learning_rate': '5e-05', 'epoch': '0.1168', 'num_input_tokens_seen': 9493986, 'train_runtime': '4802', 'train_tokens_per_second': '1977'} +{'loss': '0.5727', 'grad_norm': '1.426', 'learning_rate': '5e-05', 'epoch': '0.1168', 'num_input_tokens_seen': 9496033, 'train_runtime': '4803', 'train_tokens_per_second': '1977'} +{'loss': '0.5815', 'grad_norm': '1.459', 'learning_rate': '5e-05', 'epoch': '0.1168', 'num_input_tokens_seen': 9498080, 'train_runtime': '4804', 'train_tokens_per_second': '1977'} +{'loss': '0.2961', 'grad_norm': '1.181', 'learning_rate': '5e-05', 'epoch': '0.1169', 'num_input_tokens_seen': 9500127, 'train_runtime': '4805', 'train_tokens_per_second': '1977'} +{'loss': '0.4886', 'grad_norm': '1.062', 'learning_rate': '5e-05', 'epoch': '0.1169', 'num_input_tokens_seen': 9502174, 'train_runtime': '4806', 'train_tokens_per_second': '1977'} +{'loss': '1.942', 'grad_norm': '2.46', 'learning_rate': '5e-05', 'epoch': '0.1169', 'num_input_tokens_seen': 9504221, 'train_runtime': '4807', 'train_tokens_per_second': '1977'} +{'loss': '0.6847', 'grad_norm': '1.28', 'learning_rate': '5e-05', 'epoch': '0.1169', 'num_input_tokens_seen': 9506268, 'train_runtime': '4808', 'train_tokens_per_second': '1977'} +{'loss': '2.211', 'grad_norm': '2.755', 'learning_rate': '5e-05', 'epoch': '0.117', 'num_input_tokens_seen': 9508315, 'train_runtime': '4809', 'train_tokens_per_second': '1977'} +{'loss': '0.6712', 'grad_norm': '1.361', 'learning_rate': '5e-05', 'epoch': '0.117', 'num_input_tokens_seen': 9510362, 'train_runtime': '4810', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '2.097', 'learning_rate': '5e-05', 'epoch': '0.117', 'num_input_tokens_seen': 9512409, 'train_runtime': '4811', 'train_tokens_per_second': '1977'} +{'loss': '0.4095', 'grad_norm': '1.175', 'learning_rate': '5e-05', 'epoch': '0.117', 'num_input_tokens_seen': 9514456, 'train_runtime': '4812', 'train_tokens_per_second': '1977'} +{'loss': '0.5505', 'grad_norm': '1.766', 'learning_rate': '5e-05', 'epoch': '0.1171', 'num_input_tokens_seen': 9516503, 'train_runtime': '4813', 'train_tokens_per_second': '1977'} +{'loss': '0.7156', 'grad_norm': '1.596', 'learning_rate': '5e-05', 'epoch': '0.1171', 'num_input_tokens_seen': 9518550, 'train_runtime': '4814', 'train_tokens_per_second': '1977'} +{'loss': '1.091', 'grad_norm': '2.291', 'learning_rate': '5e-05', 'epoch': '0.1171', 'num_input_tokens_seen': 9520597, 'train_runtime': '4815', 'train_tokens_per_second': '1977'} +{'loss': '0.8779', 'grad_norm': '1.736', 'learning_rate': '5e-05', 'epoch': '0.1171', 'num_input_tokens_seen': 9522644, 'train_runtime': '4816', 'train_tokens_per_second': '1977'} +{'loss': '2.471', 'grad_norm': '2.692', 'learning_rate': '5e-05', 'epoch': '0.1172', 'num_input_tokens_seen': 9524691, 'train_runtime': '4817', 'train_tokens_per_second': '1977'} +{'loss': '0.7185', 'grad_norm': '1.939', 'learning_rate': '5e-05', 'epoch': '0.1172', 'num_input_tokens_seen': 9526738, 'train_runtime': '4818', 'train_tokens_per_second': '1977'} +{'loss': '0.4332', 'grad_norm': '1.086', 'learning_rate': '5e-05', 'epoch': '0.1172', 'num_input_tokens_seen': 9528785, 'train_runtime': '4819', 'train_tokens_per_second': '1977'} +{'loss': '1.411', 'grad_norm': '2.199', 'learning_rate': '5e-05', 'epoch': '0.1172', 'num_input_tokens_seen': 9530832, 'train_runtime': '4820', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '2.125', 'learning_rate': '5e-05', 'epoch': '0.1173', 'num_input_tokens_seen': 9532879, 'train_runtime': '4821', 'train_tokens_per_second': '1977'} +{'loss': '0.8906', 'grad_norm': '1.468', 'learning_rate': '5e-05', 'epoch': '0.1173', 'num_input_tokens_seen': 9534926, 'train_runtime': '4822', 'train_tokens_per_second': '1977'} +{'loss': '0.5444', 'grad_norm': '1.488', 'learning_rate': '5e-05', 'epoch': '0.1173', 'num_input_tokens_seen': 9536973, 'train_runtime': '4823', 'train_tokens_per_second': '1977'} +{'loss': '2.504', 'grad_norm': '2.546', 'learning_rate': '5e-05', 'epoch': '0.1173', 'num_input_tokens_seen': 9539020, 'train_runtime': '4824', 'train_tokens_per_second': '1977'} +{'loss': '0.7596', 'grad_norm': '1.857', 'learning_rate': '5e-05', 'epoch': '0.1174', 'num_input_tokens_seen': 9541067, 'train_runtime': '4826', 'train_tokens_per_second': '1977'} +{'loss': '0.4164', 'grad_norm': '1.315', 'learning_rate': '5e-05', 'epoch': '0.1174', 'num_input_tokens_seen': 9543114, 'train_runtime': '4827', 'train_tokens_per_second': '1977'} +{'loss': '0.5552', 'grad_norm': '1.442', 'learning_rate': '5e-05', 'epoch': '0.1174', 'num_input_tokens_seen': 9545161, 'train_runtime': '4828', 'train_tokens_per_second': '1977'} +{'loss': '0.5273', 'grad_norm': '1.392', 'learning_rate': '5e-05', 'epoch': '0.1174', 'num_input_tokens_seen': 9547208, 'train_runtime': '4829', 'train_tokens_per_second': '1977'} +{'loss': '1.103', 'grad_norm': '1.918', 'learning_rate': '5e-05', 'epoch': '0.1175', 'num_input_tokens_seen': 9549255, 'train_runtime': '4830', 'train_tokens_per_second': '1977'} +{'loss': '0.8735', 'grad_norm': '1.539', 'learning_rate': '5e-05', 'epoch': '0.1175', 'num_input_tokens_seen': 9551302, 'train_runtime': '4831', 'train_tokens_per_second': '1977'} +{'loss': '0.9447', 'grad_norm': '1.782', 'learning_rate': '5e-05', 'epoch': '0.1175', 'num_input_tokens_seen': 9553349, 'train_runtime': '4832', 'train_tokens_per_second': '1977'} +{'loss': '0.6975', 'grad_norm': '1.694', 'learning_rate': '5e-05', 'epoch': '0.1175', 'num_input_tokens_seen': 9555396, 'train_runtime': '4833', 'train_tokens_per_second': '1977'} +{'loss': '1.693', 'grad_norm': '2.383', 'learning_rate': '5e-05', 'epoch': '0.1176', 'num_input_tokens_seen': 9557443, 'train_runtime': '4834', 'train_tokens_per_second': '1977'} +{'loss': '1.427', 'grad_norm': '2.064', 'learning_rate': '5e-05', 'epoch': '0.1176', 'num_input_tokens_seen': 9559490, 'train_runtime': '4835', 'train_tokens_per_second': '1977'} +{'loss': '0.2665', 'grad_norm': '0.9166', 'learning_rate': '5e-05', 'epoch': '0.1176', 'num_input_tokens_seen': 9561537, 'train_runtime': '4836', 'train_tokens_per_second': '1977'} +{'loss': '0.8682', 'grad_norm': '1.702', 'learning_rate': '5e-05', 'epoch': '0.1176', 'num_input_tokens_seen': 9563584, 'train_runtime': '4837', 'train_tokens_per_second': '1977'} +{'loss': '0.9497', 'grad_norm': '1.805', 'learning_rate': '5e-05', 'epoch': '0.1177', 'num_input_tokens_seen': 9565631, 'train_runtime': '4838', 'train_tokens_per_second': '1977'} +{'loss': '0.9952', 'grad_norm': '1.549', 'learning_rate': '5e-05', 'epoch': '0.1177', 'num_input_tokens_seen': 9567678, 'train_runtime': '4839', 'train_tokens_per_second': '1977'} +{'loss': '0.8661', 'grad_norm': '1.702', 'learning_rate': '5e-05', 'epoch': '0.1177', 'num_input_tokens_seen': 9569725, 'train_runtime': '4840', 'train_tokens_per_second': '1977'} +{'loss': '0.3597', 'grad_norm': '1.252', 'learning_rate': '5e-05', 'epoch': '0.1177', 'num_input_tokens_seen': 9571772, 'train_runtime': '4841', 'train_tokens_per_second': '1977'} +{'loss': '0.7913', 'grad_norm': '1.465', 'learning_rate': '5e-05', 'epoch': '0.1178', 'num_input_tokens_seen': 9573819, 'train_runtime': '4842', 'train_tokens_per_second': '1977'} +{'loss': '0.7072', 'grad_norm': '1.837', 'learning_rate': '5e-05', 'epoch': '0.1178', 'num_input_tokens_seen': 9575866, 'train_runtime': '4843', 'train_tokens_per_second': '1977'} +{'loss': '1.466', 'grad_norm': '2.742', 'learning_rate': '5e-05', 'epoch': '0.1178', 'num_input_tokens_seen': 9577913, 'train_runtime': '4844', 'train_tokens_per_second': '1977'} +{'loss': '1.842', 'grad_norm': '2.954', 'learning_rate': '5e-05', 'epoch': '0.1178', 'num_input_tokens_seen': 9579960, 'train_runtime': '4845', 'train_tokens_per_second': '1977'} +{'loss': '0.3844', 'grad_norm': '1.318', 'learning_rate': '5e-05', 'epoch': '0.1179', 'num_input_tokens_seen': 9582007, 'train_runtime': '4846', 'train_tokens_per_second': '1977'} +{'loss': '0.3909', 'grad_norm': '1.189', 'learning_rate': '5e-05', 'epoch': '0.1179', 'num_input_tokens_seen': 9584054, 'train_runtime': '4847', 'train_tokens_per_second': '1977'} +{'loss': '0.4087', 'grad_norm': '1.103', 'learning_rate': '5e-05', 'epoch': '0.1179', 'num_input_tokens_seen': 9586101, 'train_runtime': '4848', 'train_tokens_per_second': '1977'} +{'loss': '0.5799', 'grad_norm': '1.628', 'learning_rate': '5e-05', 'epoch': '0.1179', 'num_input_tokens_seen': 9588148, 'train_runtime': '4849', 'train_tokens_per_second': '1977'} +{'loss': '0.3093', 'grad_norm': '1.095', 'learning_rate': '5e-05', 'epoch': '0.118', 'num_input_tokens_seen': 9590195, 'train_runtime': '4850', 'train_tokens_per_second': '1977'} +{'loss': '1.539', 'grad_norm': '2.194', 'learning_rate': '5e-05', 'epoch': '0.118', 'num_input_tokens_seen': 9592242, 'train_runtime': '4851', 'train_tokens_per_second': '1977'} +{'loss': '0.5905', 'grad_norm': '1.151', 'learning_rate': '5e-05', 'epoch': '0.118', 'num_input_tokens_seen': 9594289, 'train_runtime': '4852', 'train_tokens_per_second': '1977'} +{'loss': '0.4137', 'grad_norm': '1.351', 'learning_rate': '5e-05', 'epoch': '0.118', 'num_input_tokens_seen': 9596336, 'train_runtime': '4853', 'train_tokens_per_second': '1977'} +{'loss': '1.379', 'grad_norm': '1.942', 'learning_rate': '5e-05', 'epoch': '0.1181', 'num_input_tokens_seen': 9598383, 'train_runtime': '4855', 'train_tokens_per_second': '1977'} +{'loss': '1.051', 'grad_norm': '1.833', 'learning_rate': '5e-05', 'epoch': '0.1181', 'num_input_tokens_seen': 9600430, 'train_runtime': '4856', 'train_tokens_per_second': '1977'} +{'loss': '1.744', 'grad_norm': '2.575', 'learning_rate': '5e-05', 'epoch': '0.1181', 'num_input_tokens_seen': 9602477, 'train_runtime': '4857', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '1.983', 'learning_rate': '5e-05', 'epoch': '0.1181', 'num_input_tokens_seen': 9604524, 'train_runtime': '4858', 'train_tokens_per_second': '1977'} +{'loss': '0.471', 'grad_norm': '1.41', 'learning_rate': '5e-05', 'epoch': '0.1182', 'num_input_tokens_seen': 9606571, 'train_runtime': '4859', 'train_tokens_per_second': '1977'} +{'loss': '0.444', 'grad_norm': '1.42', 'learning_rate': '5e-05', 'epoch': '0.1182', 'num_input_tokens_seen': 9608618, 'train_runtime': '4860', 'train_tokens_per_second': '1977'} +{'loss': '1.47', 'grad_norm': '2.43', 'learning_rate': '5e-05', 'epoch': '0.1182', 'num_input_tokens_seen': 9610665, 'train_runtime': '4861', 'train_tokens_per_second': '1977'} +{'loss': '0.8762', 'grad_norm': '2.059', 'learning_rate': '5e-05', 'epoch': '0.1182', 'num_input_tokens_seen': 9612712, 'train_runtime': '4862', 'train_tokens_per_second': '1977'} +{'loss': '0.8148', 'grad_norm': '1.442', 'learning_rate': '5e-05', 'epoch': '0.1183', 'num_input_tokens_seen': 9614759, 'train_runtime': '4863', 'train_tokens_per_second': '1977'} +{'loss': '1.021', 'grad_norm': '2.116', 'learning_rate': '5e-05', 'epoch': '0.1183', 'num_input_tokens_seen': 9616806, 'train_runtime': '4864', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.956', 'learning_rate': '5e-05', 'epoch': '0.1183', 'num_input_tokens_seen': 9618853, 'train_runtime': '4865', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.69', 'learning_rate': '5e-05', 'epoch': '0.1183', 'num_input_tokens_seen': 9620900, 'train_runtime': '4866', 'train_tokens_per_second': '1977'} +{'loss': '0.7633', 'grad_norm': '1.308', 'learning_rate': '5e-05', 'epoch': '0.1184', 'num_input_tokens_seen': 9622947, 'train_runtime': '4867', 'train_tokens_per_second': '1977'} +{'loss': '0.5824', 'grad_norm': '1.343', 'learning_rate': '5e-05', 'epoch': '0.1184', 'num_input_tokens_seen': 9624994, 'train_runtime': '4868', 'train_tokens_per_second': '1977'} +{'loss': '0.6027', 'grad_norm': '1.331', 'learning_rate': '5e-05', 'epoch': '0.1184', 'num_input_tokens_seen': 9627041, 'train_runtime': '4869', 'train_tokens_per_second': '1977'} +{'loss': '0.3304', 'grad_norm': '1.03', 'learning_rate': '5e-05', 'epoch': '0.1184', 'num_input_tokens_seen': 9629088, 'train_runtime': '4870', 'train_tokens_per_second': '1977'} +{'loss': '0.3218', 'grad_norm': '1.324', 'learning_rate': '5e-05', 'epoch': '0.1185', 'num_input_tokens_seen': 9631135, 'train_runtime': '4871', 'train_tokens_per_second': '1977'} +{'loss': '2.557', 'grad_norm': '2.691', 'learning_rate': '5e-05', 'epoch': '0.1185', 'num_input_tokens_seen': 9633182, 'train_runtime': '4872', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.76', 'learning_rate': '5e-05', 'epoch': '0.1185', 'num_input_tokens_seen': 9635229, 'train_runtime': '4873', 'train_tokens_per_second': '1977'} +{'loss': '0.7734', 'grad_norm': '1.683', 'learning_rate': '5e-05', 'epoch': '0.1185', 'num_input_tokens_seen': 9637276, 'train_runtime': '4874', 'train_tokens_per_second': '1977'} +{'loss': '0.2648', 'grad_norm': '1.171', 'learning_rate': '5e-05', 'epoch': '0.1186', 'num_input_tokens_seen': 9639323, 'train_runtime': '4875', 'train_tokens_per_second': '1977'} +{'loss': '2.209', 'grad_norm': '3.083', 'learning_rate': '5e-05', 'epoch': '0.1186', 'num_input_tokens_seen': 9641370, 'train_runtime': '4876', 'train_tokens_per_second': '1977'} +{'loss': '0.3877', 'grad_norm': '1.303', 'learning_rate': '5e-05', 'epoch': '0.1186', 'num_input_tokens_seen': 9643417, 'train_runtime': '4877', 'train_tokens_per_second': '1977'} +{'loss': '1.743', 'grad_norm': '2.834', 'learning_rate': '5e-05', 'epoch': '0.1186', 'num_input_tokens_seen': 9645464, 'train_runtime': '4878', 'train_tokens_per_second': '1977'} +{'loss': '0.8478', 'grad_norm': '1.793', 'learning_rate': '5e-05', 'epoch': '0.1187', 'num_input_tokens_seen': 9647511, 'train_runtime': '4879', 'train_tokens_per_second': '1977'} +{'loss': '0.6903', 'grad_norm': '1.215', 'learning_rate': '5e-05', 'epoch': '0.1187', 'num_input_tokens_seen': 9649558, 'train_runtime': '4880', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.944', 'learning_rate': '5e-05', 'epoch': '0.1187', 'num_input_tokens_seen': 9651605, 'train_runtime': '4881', 'train_tokens_per_second': '1977'} +{'loss': '0.324', 'grad_norm': '1.106', 'learning_rate': '5e-05', 'epoch': '0.1187', 'num_input_tokens_seen': 9653652, 'train_runtime': '4882', 'train_tokens_per_second': '1977'} +{'loss': '0.56', 'grad_norm': '1.691', 'learning_rate': '5e-05', 'epoch': '0.1188', 'num_input_tokens_seen': 9655699, 'train_runtime': '4883', 'train_tokens_per_second': '1977'} +{'loss': '1.25', 'grad_norm': '1.9', 'learning_rate': '5e-05', 'epoch': '0.1188', 'num_input_tokens_seen': 9657746, 'train_runtime': '4885', 'train_tokens_per_second': '1977'} +{'loss': '0.7544', 'grad_norm': '1.572', 'learning_rate': '5e-05', 'epoch': '0.1188', 'num_input_tokens_seen': 9659793, 'train_runtime': '4886', 'train_tokens_per_second': '1977'} +{'loss': '2.406', 'grad_norm': '2.377', 'learning_rate': '5e-05', 'epoch': '0.1188', 'num_input_tokens_seen': 9661840, 'train_runtime': '4887', 'train_tokens_per_second': '1977'} +{'loss': '0.3956', 'grad_norm': '1.256', 'learning_rate': '5e-05', 'epoch': '0.1189', 'num_input_tokens_seen': 9663887, 'train_runtime': '4888', 'train_tokens_per_second': '1977'} +{'loss': '0.8978', 'grad_norm': '1.704', 'learning_rate': '5e-05', 'epoch': '0.1189', 'num_input_tokens_seen': 9665934, 'train_runtime': '4889', 'train_tokens_per_second': '1977'} +{'loss': '0.5235', 'grad_norm': '1.474', 'learning_rate': '5e-05', 'epoch': '0.1189', 'num_input_tokens_seen': 9667981, 'train_runtime': '4890', 'train_tokens_per_second': '1977'} +{'loss': '2.144', 'grad_norm': '2.582', 'learning_rate': '5e-05', 'epoch': '0.1189', 'num_input_tokens_seen': 9670028, 'train_runtime': '4891', 'train_tokens_per_second': '1977'} +{'loss': '0.4301', 'grad_norm': '1.131', 'learning_rate': '5e-05', 'epoch': '0.119', 'num_input_tokens_seen': 9672075, 'train_runtime': '4892', 'train_tokens_per_second': '1977'} +{'loss': '1.157', 'grad_norm': '2.472', 'learning_rate': '5e-05', 'epoch': '0.119', 'num_input_tokens_seen': 9674122, 'train_runtime': '4893', 'train_tokens_per_second': '1977'} +{'loss': '2.437', 'grad_norm': '2.281', 'learning_rate': '5e-05', 'epoch': '0.119', 'num_input_tokens_seen': 9676169, 'train_runtime': '4894', 'train_tokens_per_second': '1977'} +{'loss': '0.62', 'grad_norm': '1.824', 'learning_rate': '5e-05', 'epoch': '0.119', 'num_input_tokens_seen': 9678216, 'train_runtime': '4895', 'train_tokens_per_second': '1977'} +{'loss': '0.2645', 'grad_norm': '0.9688', 'learning_rate': '5e-05', 'epoch': '0.1191', 'num_input_tokens_seen': 9680263, 'train_runtime': '4896', 'train_tokens_per_second': '1977'} +{'loss': '1.476', 'grad_norm': '2.158', 'learning_rate': '5e-05', 'epoch': '0.1191', 'num_input_tokens_seen': 9682310, 'train_runtime': '4897', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '1.736', 'learning_rate': '5e-05', 'epoch': '0.1191', 'num_input_tokens_seen': 9684357, 'train_runtime': '4898', 'train_tokens_per_second': '1977'} +{'loss': '0.4267', 'grad_norm': '1.188', 'learning_rate': '5e-05', 'epoch': '0.1191', 'num_input_tokens_seen': 9686404, 'train_runtime': '4899', 'train_tokens_per_second': '1977'} +{'loss': '1.705', 'grad_norm': '2.129', 'learning_rate': '5e-05', 'epoch': '0.1192', 'num_input_tokens_seen': 9688451, 'train_runtime': '4900', 'train_tokens_per_second': '1977'} +{'loss': '0.6158', 'grad_norm': '1.547', 'learning_rate': '5e-05', 'epoch': '0.1192', 'num_input_tokens_seen': 9690498, 'train_runtime': '4901', 'train_tokens_per_second': '1977'} +{'loss': '1.63', 'grad_norm': '2.392', 'learning_rate': '5e-05', 'epoch': '0.1192', 'num_input_tokens_seen': 9692545, 'train_runtime': '4902', 'train_tokens_per_second': '1977'} +{'loss': '0.8881', 'grad_norm': '1.561', 'learning_rate': '5e-05', 'epoch': '0.1192', 'num_input_tokens_seen': 9694592, 'train_runtime': '4903', 'train_tokens_per_second': '1977'} +{'loss': '1.189', 'grad_norm': '1.791', 'learning_rate': '5e-05', 'epoch': '0.1193', 'num_input_tokens_seen': 9696639, 'train_runtime': '4904', 'train_tokens_per_second': '1977'} +{'loss': '0.7716', 'grad_norm': '1.679', 'learning_rate': '5e-05', 'epoch': '0.1193', 'num_input_tokens_seen': 9698686, 'train_runtime': '4905', 'train_tokens_per_second': '1977'} +{'loss': '0.874', 'grad_norm': '1.51', 'learning_rate': '5e-05', 'epoch': '0.1193', 'num_input_tokens_seen': 9700733, 'train_runtime': '4906', 'train_tokens_per_second': '1977'} +{'loss': '0.7597', 'grad_norm': '2.027', 'learning_rate': '5e-05', 'epoch': '0.1193', 'num_input_tokens_seen': 9702780, 'train_runtime': '4907', 'train_tokens_per_second': '1977'} +{'loss': '0.9242', 'grad_norm': '1.549', 'learning_rate': '5e-05', 'epoch': '0.1194', 'num_input_tokens_seen': 9704827, 'train_runtime': '4908', 'train_tokens_per_second': '1977'} +{'loss': '0.6879', 'grad_norm': '1.25', 'learning_rate': '5e-05', 'epoch': '0.1194', 'num_input_tokens_seen': 9706874, 'train_runtime': '4909', 'train_tokens_per_second': '1977'} +{'loss': '0.8164', 'grad_norm': '1.274', 'learning_rate': '5e-05', 'epoch': '0.1194', 'num_input_tokens_seen': 9708921, 'train_runtime': '4910', 'train_tokens_per_second': '1977'} +{'loss': '1.239', 'grad_norm': '2.137', 'learning_rate': '5e-05', 'epoch': '0.1194', 'num_input_tokens_seen': 9710968, 'train_runtime': '4911', 'train_tokens_per_second': '1977'} +{'loss': '1.341', 'grad_norm': '1.972', 'learning_rate': '5e-05', 'epoch': '0.1195', 'num_input_tokens_seen': 9713015, 'train_runtime': '4912', 'train_tokens_per_second': '1977'} +{'loss': '0.2951', 'grad_norm': '0.9808', 'learning_rate': '5e-05', 'epoch': '0.1195', 'num_input_tokens_seen': 9715062, 'train_runtime': '4914', 'train_tokens_per_second': '1977'} +{'loss': '0.6832', 'grad_norm': '1.988', 'learning_rate': '5e-05', 'epoch': '0.1195', 'num_input_tokens_seen': 9717109, 'train_runtime': '4915', 'train_tokens_per_second': '1977'} +{'loss': '0.7815', 'grad_norm': '1.257', 'learning_rate': '5e-05', 'epoch': '0.1195', 'num_input_tokens_seen': 9719156, 'train_runtime': '4916', 'train_tokens_per_second': '1977'} +{'loss': '1.352', 'grad_norm': '2.032', 'learning_rate': '5e-05', 'epoch': '0.1196', 'num_input_tokens_seen': 9721203, 'train_runtime': '4917', 'train_tokens_per_second': '1977'} +{'loss': '0.581', 'grad_norm': '1.702', 'learning_rate': '5e-05', 'epoch': '0.1196', 'num_input_tokens_seen': 9723250, 'train_runtime': '4918', 'train_tokens_per_second': '1977'} +{'loss': '1.479', 'grad_norm': '2.465', 'learning_rate': '5e-05', 'epoch': '0.1196', 'num_input_tokens_seen': 9725297, 'train_runtime': '4919', 'train_tokens_per_second': '1977'} +{'loss': '1.772', 'grad_norm': '2.237', 'learning_rate': '5e-05', 'epoch': '0.1196', 'num_input_tokens_seen': 9727344, 'train_runtime': '4920', 'train_tokens_per_second': '1977'} +{'loss': '0.4817', 'grad_norm': '1.32', 'learning_rate': '5e-05', 'epoch': '0.1197', 'num_input_tokens_seen': 9729391, 'train_runtime': '4921', 'train_tokens_per_second': '1977'} +{'loss': '0.5925', 'grad_norm': '1.271', 'learning_rate': '5e-05', 'epoch': '0.1197', 'num_input_tokens_seen': 9731438, 'train_runtime': '4922', 'train_tokens_per_second': '1977'} +{'loss': '0.2483', 'grad_norm': '0.9247', 'learning_rate': '5e-05', 'epoch': '0.1197', 'num_input_tokens_seen': 9733485, 'train_runtime': '4923', 'train_tokens_per_second': '1977'} +{'loss': '0.5967', 'grad_norm': '1.3', 'learning_rate': '5e-05', 'epoch': '0.1197', 'num_input_tokens_seen': 9735532, 'train_runtime': '4924', 'train_tokens_per_second': '1977'} +{'loss': '0.9071', 'grad_norm': '1.705', 'learning_rate': '5e-05', 'epoch': '0.1198', 'num_input_tokens_seen': 9737579, 'train_runtime': '4925', 'train_tokens_per_second': '1977'} +{'loss': '0.2902', 'grad_norm': '1.235', 'learning_rate': '5e-05', 'epoch': '0.1198', 'num_input_tokens_seen': 9739626, 'train_runtime': '4926', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '1.842', 'learning_rate': '5e-05', 'epoch': '0.1198', 'num_input_tokens_seen': 9741673, 'train_runtime': '4927', 'train_tokens_per_second': '1977'} +{'loss': '0.3098', 'grad_norm': '1.3', 'learning_rate': '5e-05', 'epoch': '0.1198', 'num_input_tokens_seen': 9743720, 'train_runtime': '4928', 'train_tokens_per_second': '1977'} +{'loss': '0.8083', 'grad_norm': '1.305', 'learning_rate': '5e-05', 'epoch': '0.1199', 'num_input_tokens_seen': 9745767, 'train_runtime': '4929', 'train_tokens_per_second': '1977'} +{'loss': '0.4126', 'grad_norm': '1.091', 'learning_rate': '5e-05', 'epoch': '0.1199', 'num_input_tokens_seen': 9747814, 'train_runtime': '4930', 'train_tokens_per_second': '1977'} +{'loss': '0.6846', 'grad_norm': '1.62', 'learning_rate': '5e-05', 'epoch': '0.1199', 'num_input_tokens_seen': 9749861, 'train_runtime': '4931', 'train_tokens_per_second': '1977'} +{'loss': '0.8167', 'grad_norm': '2.316', 'learning_rate': '5e-05', 'epoch': '0.1199', 'num_input_tokens_seen': 9751908, 'train_runtime': '4932', 'train_tokens_per_second': '1977'} +{'loss': '0.7826', 'grad_norm': '1.526', 'learning_rate': '5e-05', 'epoch': '0.12', 'num_input_tokens_seen': 9753955, 'train_runtime': '4933', 'train_tokens_per_second': '1977'} +{'loss': '0.6444', 'grad_norm': '1.212', 'learning_rate': '5e-05', 'epoch': '0.12', 'num_input_tokens_seen': 9756002, 'train_runtime': '4934', 'train_tokens_per_second': '1977'} +{'loss': '1.307', 'grad_norm': '1.868', 'learning_rate': '5e-05', 'epoch': '0.12', 'num_input_tokens_seen': 9758049, 'train_runtime': '4935', 'train_tokens_per_second': '1977'} +{'loss': '2.05', 'grad_norm': '2.328', 'learning_rate': '5e-05', 'epoch': '0.12', 'num_input_tokens_seen': 9760096, 'train_runtime': '4936', 'train_tokens_per_second': '1977'} +{'loss': '0.7226', 'grad_norm': '1.654', 'learning_rate': '5e-05', 'epoch': '0.1201', 'num_input_tokens_seen': 9762143, 'train_runtime': '4937', 'train_tokens_per_second': '1977'} +{'loss': '0.328', 'grad_norm': '1.074', 'learning_rate': '5e-05', 'epoch': '0.1201', 'num_input_tokens_seen': 9764190, 'train_runtime': '4938', 'train_tokens_per_second': '1977'} +{'loss': '0.9118', 'grad_norm': '1.598', 'learning_rate': '5e-05', 'epoch': '0.1201', 'num_input_tokens_seen': 9766237, 'train_runtime': '4939', 'train_tokens_per_second': '1977'} +{'loss': '2.267', 'grad_norm': '2.509', 'learning_rate': '5e-05', 'epoch': '0.1202', 'num_input_tokens_seen': 9768284, 'train_runtime': '4940', 'train_tokens_per_second': '1977'} +{'loss': '0.4992', 'grad_norm': '1.491', 'learning_rate': '5e-05', 'epoch': '0.1202', 'num_input_tokens_seen': 9770331, 'train_runtime': '4942', 'train_tokens_per_second': '1977'} +{'loss': '0.578', 'grad_norm': '1.106', 'learning_rate': '5e-05', 'epoch': '0.1202', 'num_input_tokens_seen': 9772378, 'train_runtime': '4943', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '2.359', 'learning_rate': '5e-05', 'epoch': '0.1202', 'num_input_tokens_seen': 9774425, 'train_runtime': '4944', 'train_tokens_per_second': '1977'} +{'loss': '0.4043', 'grad_norm': '1.222', 'learning_rate': '5e-05', 'epoch': '0.1203', 'num_input_tokens_seen': 9776472, 'train_runtime': '4945', 'train_tokens_per_second': '1977'} +{'loss': '1.323', 'grad_norm': '2.297', 'learning_rate': '5e-05', 'epoch': '0.1203', 'num_input_tokens_seen': 9778519, 'train_runtime': '4946', 'train_tokens_per_second': '1977'} +{'loss': '0.4069', 'grad_norm': '0.9982', 'learning_rate': '5e-05', 'epoch': '0.1203', 'num_input_tokens_seen': 9780566, 'train_runtime': '4947', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '2.091', 'learning_rate': '5e-05', 'epoch': '0.1203', 'num_input_tokens_seen': 9782613, 'train_runtime': '4948', 'train_tokens_per_second': '1977'} +{'loss': '1.31', 'grad_norm': '2.253', 'learning_rate': '5e-05', 'epoch': '0.1204', 'num_input_tokens_seen': 9784660, 'train_runtime': '4949', 'train_tokens_per_second': '1977'} +{'loss': '1.725', 'grad_norm': '2.608', 'learning_rate': '5e-05', 'epoch': '0.1204', 'num_input_tokens_seen': 9786707, 'train_runtime': '4950', 'train_tokens_per_second': '1977'} +{'loss': '1.652', 'grad_norm': '2.734', 'learning_rate': '5e-05', 'epoch': '0.1204', 'num_input_tokens_seen': 9788754, 'train_runtime': '4951', 'train_tokens_per_second': '1977'} +{'loss': '0.7634', 'grad_norm': '1.926', 'learning_rate': '5e-05', 'epoch': '0.1204', 'num_input_tokens_seen': 9790801, 'train_runtime': '4952', 'train_tokens_per_second': '1977'} +{'loss': '1.067', 'grad_norm': '1.859', 'learning_rate': '5e-05', 'epoch': '0.1205', 'num_input_tokens_seen': 9792848, 'train_runtime': '4953', 'train_tokens_per_second': '1977'} +{'loss': '0.64', 'grad_norm': '1.733', 'learning_rate': '5e-05', 'epoch': '0.1205', 'num_input_tokens_seen': 9794895, 'train_runtime': '4954', 'train_tokens_per_second': '1977'} +{'loss': '0.3436', 'grad_norm': '1.054', 'learning_rate': '5e-05', 'epoch': '0.1205', 'num_input_tokens_seen': 9796942, 'train_runtime': '4955', 'train_tokens_per_second': '1977'} +{'loss': '0.4199', 'grad_norm': '1.066', 'learning_rate': '5e-05', 'epoch': '0.1205', 'num_input_tokens_seen': 9798989, 'train_runtime': '4956', 'train_tokens_per_second': '1977'} +{'loss': '0.41', 'grad_norm': '1.432', 'learning_rate': '5e-05', 'epoch': '0.1206', 'num_input_tokens_seen': 9801036, 'train_runtime': '4957', 'train_tokens_per_second': '1977'} +{'loss': '0.4662', 'grad_norm': '1.279', 'learning_rate': '5e-05', 'epoch': '0.1206', 'num_input_tokens_seen': 9803083, 'train_runtime': '4958', 'train_tokens_per_second': '1977'} +{'loss': '0.3275', 'grad_norm': '0.9912', 'learning_rate': '5e-05', 'epoch': '0.1206', 'num_input_tokens_seen': 9805130, 'train_runtime': '4959', 'train_tokens_per_second': '1977'} +{'loss': '0.7675', 'grad_norm': '1.701', 'learning_rate': '5e-05', 'epoch': '0.1206', 'num_input_tokens_seen': 9807177, 'train_runtime': '4960', 'train_tokens_per_second': '1977'} +{'loss': '0.3386', 'grad_norm': '1.331', 'learning_rate': '5e-05', 'epoch': '0.1207', 'num_input_tokens_seen': 9809224, 'train_runtime': '4961', 'train_tokens_per_second': '1977'} +{'loss': '0.5619', 'grad_norm': '1.704', 'learning_rate': '5e-05', 'epoch': '0.1207', 'num_input_tokens_seen': 9811271, 'train_runtime': '4962', 'train_tokens_per_second': '1977'} +{'loss': '0.4528', 'grad_norm': '1.383', 'learning_rate': '5e-05', 'epoch': '0.1207', 'num_input_tokens_seen': 9813318, 'train_runtime': '4963', 'train_tokens_per_second': '1977'} +{'loss': '1.318', 'grad_norm': '2.037', 'learning_rate': '5e-05', 'epoch': '0.1207', 'num_input_tokens_seen': 9815365, 'train_runtime': '4964', 'train_tokens_per_second': '1977'} +{'loss': '1.781', 'grad_norm': '2.536', 'learning_rate': '5e-05', 'epoch': '0.1208', 'num_input_tokens_seen': 9817412, 'train_runtime': '4965', 'train_tokens_per_second': '1977'} +{'loss': '0.4499', 'grad_norm': '1.282', 'learning_rate': '5e-05', 'epoch': '0.1208', 'num_input_tokens_seen': 9819459, 'train_runtime': '4966', 'train_tokens_per_second': '1977'} +{'loss': '0.9252', 'grad_norm': '1.668', 'learning_rate': '5e-05', 'epoch': '0.1208', 'num_input_tokens_seen': 9821506, 'train_runtime': '4967', 'train_tokens_per_second': '1977'} +{'loss': '1.967', 'grad_norm': '2.958', 'learning_rate': '5e-05', 'epoch': '0.1208', 'num_input_tokens_seen': 9823553, 'train_runtime': '4968', 'train_tokens_per_second': '1977'} +{'loss': '1.35', 'grad_norm': '2.18', 'learning_rate': '5e-05', 'epoch': '0.1209', 'num_input_tokens_seen': 9825600, 'train_runtime': '4969', 'train_tokens_per_second': '1977'} +{'loss': '0.3256', 'grad_norm': '1.171', 'learning_rate': '5e-05', 'epoch': '0.1209', 'num_input_tokens_seen': 9827647, 'train_runtime': '4971', 'train_tokens_per_second': '1977'} +{'loss': '0.4357', 'grad_norm': '1.266', 'learning_rate': '5e-05', 'epoch': '0.1209', 'num_input_tokens_seen': 9829694, 'train_runtime': '4972', 'train_tokens_per_second': '1977'} +{'loss': '0.8362', 'grad_norm': '1.79', 'learning_rate': '5e-05', 'epoch': '0.1209', 'num_input_tokens_seen': 9831741, 'train_runtime': '4973', 'train_tokens_per_second': '1977'} +{'loss': '0.4202', 'grad_norm': '1.042', 'learning_rate': '5e-05', 'epoch': '0.121', 'num_input_tokens_seen': 9833788, 'train_runtime': '4974', 'train_tokens_per_second': '1977'} +{'loss': '0.6745', 'grad_norm': '1.569', 'learning_rate': '5e-05', 'epoch': '0.121', 'num_input_tokens_seen': 9835835, 'train_runtime': '4975', 'train_tokens_per_second': '1977'} +{'loss': '0.4879', 'grad_norm': '0.9509', 'learning_rate': '5e-05', 'epoch': '0.121', 'num_input_tokens_seen': 9837882, 'train_runtime': '4976', 'train_tokens_per_second': '1977'} +{'loss': '2.727', 'grad_norm': '2.762', 'learning_rate': '5e-05', 'epoch': '0.121', 'num_input_tokens_seen': 9839929, 'train_runtime': '4977', 'train_tokens_per_second': '1977'} +{'loss': '0.4838', 'grad_norm': '1.207', 'learning_rate': '5e-05', 'epoch': '0.1211', 'num_input_tokens_seen': 9841976, 'train_runtime': '4978', 'train_tokens_per_second': '1977'} +{'loss': '0.8261', 'grad_norm': '1.142', 'learning_rate': '5e-05', 'epoch': '0.1211', 'num_input_tokens_seen': 9844023, 'train_runtime': '4979', 'train_tokens_per_second': '1977'} +{'loss': '0.3238', 'grad_norm': '1.3', 'learning_rate': '5e-05', 'epoch': '0.1211', 'num_input_tokens_seen': 9846070, 'train_runtime': '4980', 'train_tokens_per_second': '1977'} +{'loss': '0.382', 'grad_norm': '1.253', 'learning_rate': '5e-05', 'epoch': '0.1211', 'num_input_tokens_seen': 9848117, 'train_runtime': '4981', 'train_tokens_per_second': '1977'} +{'loss': '0.5291', 'grad_norm': '1.146', 'learning_rate': '5e-05', 'epoch': '0.1212', 'num_input_tokens_seen': 9850164, 'train_runtime': '4982', 'train_tokens_per_second': '1977'} +{'loss': '1.454', 'grad_norm': '2.427', 'learning_rate': '5e-05', 'epoch': '0.1212', 'num_input_tokens_seen': 9852211, 'train_runtime': '4983', 'train_tokens_per_second': '1977'} +{'loss': '0.6045', 'grad_norm': '1.278', 'learning_rate': '5e-05', 'epoch': '0.1212', 'num_input_tokens_seen': 9854258, 'train_runtime': '4984', 'train_tokens_per_second': '1977'} +{'loss': '1.505', 'grad_norm': '1.993', 'learning_rate': '5e-05', 'epoch': '0.1212', 'num_input_tokens_seen': 9856305, 'train_runtime': '4985', 'train_tokens_per_second': '1977'} +{'loss': '2.999', 'grad_norm': '2.633', 'learning_rate': '5e-05', 'epoch': '0.1213', 'num_input_tokens_seen': 9858352, 'train_runtime': '4986', 'train_tokens_per_second': '1977'} +{'loss': '0.9399', 'grad_norm': '1.513', 'learning_rate': '5e-05', 'epoch': '0.1213', 'num_input_tokens_seen': 9860399, 'train_runtime': '4987', 'train_tokens_per_second': '1977'} +{'loss': '0.3585', 'grad_norm': '1.528', 'learning_rate': '5e-05', 'epoch': '0.1213', 'num_input_tokens_seen': 9862446, 'train_runtime': '4988', 'train_tokens_per_second': '1977'} +{'loss': '0.2893', 'grad_norm': '1.184', 'learning_rate': '5e-05', 'epoch': '0.1213', 'num_input_tokens_seen': 9864493, 'train_runtime': '4989', 'train_tokens_per_second': '1977'} +{'loss': '0.8158', 'grad_norm': '1.427', 'learning_rate': '5e-05', 'epoch': '0.1214', 'num_input_tokens_seen': 9866540, 'train_runtime': '4990', 'train_tokens_per_second': '1977'} +{'loss': '1.381', 'grad_norm': '2.048', 'learning_rate': '5e-05', 'epoch': '0.1214', 'num_input_tokens_seen': 9868587, 'train_runtime': '4991', 'train_tokens_per_second': '1977'} +{'loss': '0.6401', 'grad_norm': '1.308', 'learning_rate': '5e-05', 'epoch': '0.1214', 'num_input_tokens_seen': 9870634, 'train_runtime': '4992', 'train_tokens_per_second': '1977'} +{'loss': '1.296', 'grad_norm': '2.116', 'learning_rate': '5e-05', 'epoch': '0.1214', 'num_input_tokens_seen': 9872681, 'train_runtime': '4993', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.361', 'learning_rate': '5e-05', 'epoch': '0.1215', 'num_input_tokens_seen': 9874728, 'train_runtime': '4994', 'train_tokens_per_second': '1977'} +{'loss': '0.6087', 'grad_norm': '1.79', 'learning_rate': '5e-05', 'epoch': '0.1215', 'num_input_tokens_seen': 9876775, 'train_runtime': '4995', 'train_tokens_per_second': '1977'} +{'loss': '0.8725', 'grad_norm': '1.394', 'learning_rate': '5e-05', 'epoch': '0.1215', 'num_input_tokens_seen': 9878822, 'train_runtime': '4996', 'train_tokens_per_second': '1977'} +{'loss': '0.3758', 'grad_norm': '1.225', 'learning_rate': '5e-05', 'epoch': '0.1215', 'num_input_tokens_seen': 9880869, 'train_runtime': '4997', 'train_tokens_per_second': '1977'} +{'loss': '0.8584', 'grad_norm': '1.696', 'learning_rate': '5e-05', 'epoch': '0.1216', 'num_input_tokens_seen': 9882916, 'train_runtime': '4998', 'train_tokens_per_second': '1977'} +{'loss': '0.6669', 'grad_norm': '1.348', 'learning_rate': '5e-05', 'epoch': '0.1216', 'num_input_tokens_seen': 9884963, 'train_runtime': '5000', 'train_tokens_per_second': '1977'} +{'loss': '1.454', 'grad_norm': '3.065', 'learning_rate': '5e-05', 'epoch': '0.1216', 'num_input_tokens_seen': 9887010, 'train_runtime': '5001', 'train_tokens_per_second': '1977'} +{'loss': '0.4133', 'grad_norm': '1.289', 'learning_rate': '5e-05', 'epoch': '0.1216', 'num_input_tokens_seen': 9889057, 'train_runtime': '5002', 'train_tokens_per_second': '1977'} +{'loss': '1.538', 'grad_norm': '2.452', 'learning_rate': '5e-05', 'epoch': '0.1217', 'num_input_tokens_seen': 9891104, 'train_runtime': '5003', 'train_tokens_per_second': '1977'} +{'loss': '1.912', 'grad_norm': '2.902', 'learning_rate': '5e-05', 'epoch': '0.1217', 'num_input_tokens_seen': 9893151, 'train_runtime': '5004', 'train_tokens_per_second': '1977'} +{'loss': '0.5276', 'grad_norm': '1.407', 'learning_rate': '5e-05', 'epoch': '0.1217', 'num_input_tokens_seen': 9895198, 'train_runtime': '5005', 'train_tokens_per_second': '1977'} +{'loss': '0.9094', 'grad_norm': '1.305', 'learning_rate': '5e-05', 'epoch': '0.1217', 'num_input_tokens_seen': 9897245, 'train_runtime': '5006', 'train_tokens_per_second': '1977'} +{'loss': '0.3695', 'grad_norm': '1.008', 'learning_rate': '5e-05', 'epoch': '0.1218', 'num_input_tokens_seen': 9899292, 'train_runtime': '5007', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '1.711', 'learning_rate': '5e-05', 'epoch': '0.1218', 'num_input_tokens_seen': 9901339, 'train_runtime': '5008', 'train_tokens_per_second': '1977'} +{'loss': '1.221', 'grad_norm': '1.842', 'learning_rate': '5e-05', 'epoch': '0.1218', 'num_input_tokens_seen': 9903386, 'train_runtime': '5009', 'train_tokens_per_second': '1977'} +{'loss': '0.9644', 'grad_norm': '1.718', 'learning_rate': '5e-05', 'epoch': '0.1218', 'num_input_tokens_seen': 9905433, 'train_runtime': '5010', 'train_tokens_per_second': '1977'} +{'loss': '0.571', 'grad_norm': '1.32', 'learning_rate': '5e-05', 'epoch': '0.1219', 'num_input_tokens_seen': 9907480, 'train_runtime': '5011', 'train_tokens_per_second': '1977'} +{'loss': '0.8366', 'grad_norm': '1.373', 'learning_rate': '5e-05', 'epoch': '0.1219', 'num_input_tokens_seen': 9909527, 'train_runtime': '5012', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.719', 'learning_rate': '5e-05', 'epoch': '0.1219', 'num_input_tokens_seen': 9911574, 'train_runtime': '5013', 'train_tokens_per_second': '1977'} +{'loss': '1.176', 'grad_norm': '1.751', 'learning_rate': '5e-05', 'epoch': '0.1219', 'num_input_tokens_seen': 9913621, 'train_runtime': '5014', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '2.127', 'learning_rate': '5e-05', 'epoch': '0.122', 'num_input_tokens_seen': 9915668, 'train_runtime': '5015', 'train_tokens_per_second': '1977'} +{'loss': '1.315', 'grad_norm': '2.042', 'learning_rate': '5e-05', 'epoch': '0.122', 'num_input_tokens_seen': 9917715, 'train_runtime': '5016', 'train_tokens_per_second': '1977'} +{'loss': '2.198', 'grad_norm': '2.516', 'learning_rate': '5e-05', 'epoch': '0.122', 'num_input_tokens_seen': 9919762, 'train_runtime': '5017', 'train_tokens_per_second': '1977'} +{'loss': '0.3488', 'grad_norm': '1.054', 'learning_rate': '5e-05', 'epoch': '0.122', 'num_input_tokens_seen': 9921809, 'train_runtime': '5018', 'train_tokens_per_second': '1977'} +{'loss': '0.9315', 'grad_norm': '1.837', 'learning_rate': '5e-05', 'epoch': '0.1221', 'num_input_tokens_seen': 9923856, 'train_runtime': '5019', 'train_tokens_per_second': '1977'} +{'loss': '0.8372', 'grad_norm': '1.384', 'learning_rate': '5e-05', 'epoch': '0.1221', 'num_input_tokens_seen': 9925903, 'train_runtime': '5020', 'train_tokens_per_second': '1977'} +{'loss': '0.4225', 'grad_norm': '1.245', 'learning_rate': '5e-05', 'epoch': '0.1221', 'num_input_tokens_seen': 9927950, 'train_runtime': '5021', 'train_tokens_per_second': '1977'} +{'loss': '2.261', 'grad_norm': '2.53', 'learning_rate': '5e-05', 'epoch': '0.1221', 'num_input_tokens_seen': 9929997, 'train_runtime': '5022', 'train_tokens_per_second': '1977'} +{'loss': '0.7387', 'grad_norm': '1.596', 'learning_rate': '5e-05', 'epoch': '0.1222', 'num_input_tokens_seen': 9932044, 'train_runtime': '5023', 'train_tokens_per_second': '1977'} +{'loss': '0.331', 'grad_norm': '1.048', 'learning_rate': '5e-05', 'epoch': '0.1222', 'num_input_tokens_seen': 9934091, 'train_runtime': '5024', 'train_tokens_per_second': '1977'} +{'loss': '1.154', 'grad_norm': '1.876', 'learning_rate': '5e-05', 'epoch': '0.1222', 'num_input_tokens_seen': 9936138, 'train_runtime': '5025', 'train_tokens_per_second': '1977'} +{'loss': '0.8302', 'grad_norm': '1.458', 'learning_rate': '5e-05', 'epoch': '0.1222', 'num_input_tokens_seen': 9938185, 'train_runtime': '5026', 'train_tokens_per_second': '1977'} +{'loss': '2.172', 'grad_norm': '2.838', 'learning_rate': '5e-05', 'epoch': '0.1223', 'num_input_tokens_seen': 9940232, 'train_runtime': '5027', 'train_tokens_per_second': '1977'} +{'loss': '0.452', 'grad_norm': '1.09', 'learning_rate': '5e-05', 'epoch': '0.1223', 'num_input_tokens_seen': 9942279, 'train_runtime': '5029', 'train_tokens_per_second': '1977'} +{'loss': '0.5878', 'grad_norm': '1.538', 'learning_rate': '5e-05', 'epoch': '0.1223', 'num_input_tokens_seen': 9944326, 'train_runtime': '5030', 'train_tokens_per_second': '1977'} +{'loss': '0.7177', 'grad_norm': '1.975', 'learning_rate': '5e-05', 'epoch': '0.1223', 'num_input_tokens_seen': 9946373, 'train_runtime': '5031', 'train_tokens_per_second': '1977'} +{'loss': '0.5382', 'grad_norm': '1.476', 'learning_rate': '5e-05', 'epoch': '0.1224', 'num_input_tokens_seen': 9948420, 'train_runtime': '5032', 'train_tokens_per_second': '1977'} +{'loss': '0.3432', 'grad_norm': '1.466', 'learning_rate': '5e-05', 'epoch': '0.1224', 'num_input_tokens_seen': 9950467, 'train_runtime': '5033', 'train_tokens_per_second': '1977'} +{'loss': '0.9174', 'grad_norm': '1.749', 'learning_rate': '5e-05', 'epoch': '0.1224', 'num_input_tokens_seen': 9952514, 'train_runtime': '5034', 'train_tokens_per_second': '1977'} +{'loss': '2.001', 'grad_norm': '2.34', 'learning_rate': '5e-05', 'epoch': '0.1224', 'num_input_tokens_seen': 9954561, 'train_runtime': '5035', 'train_tokens_per_second': '1977'} +{'loss': '0.3615', 'grad_norm': '0.9509', 'learning_rate': '5e-05', 'epoch': '0.1225', 'num_input_tokens_seen': 9956608, 'train_runtime': '5036', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.966', 'learning_rate': '5e-05', 'epoch': '0.1225', 'num_input_tokens_seen': 9958655, 'train_runtime': '5037', 'train_tokens_per_second': '1977'} +{'loss': '1.417', 'grad_norm': '2.199', 'learning_rate': '5e-05', 'epoch': '0.1225', 'num_input_tokens_seen': 9960702, 'train_runtime': '5038', 'train_tokens_per_second': '1977'} +{'loss': '2.028', 'grad_norm': '2.464', 'learning_rate': '5e-05', 'epoch': '0.1225', 'num_input_tokens_seen': 9962749, 'train_runtime': '5039', 'train_tokens_per_second': '1977'} +{'loss': '1.095', 'grad_norm': '1.98', 'learning_rate': '5e-05', 'epoch': '0.1226', 'num_input_tokens_seen': 9964796, 'train_runtime': '5040', 'train_tokens_per_second': '1977'} +{'loss': '0.7446', 'grad_norm': '1.74', 'learning_rate': '5e-05', 'epoch': '0.1226', 'num_input_tokens_seen': 9966843, 'train_runtime': '5041', 'train_tokens_per_second': '1977'} +{'loss': '0.4798', 'grad_norm': '1.344', 'learning_rate': '5e-05', 'epoch': '0.1226', 'num_input_tokens_seen': 9968890, 'train_runtime': '5042', 'train_tokens_per_second': '1977'} +{'loss': '0.968', 'grad_norm': '1.44', 'learning_rate': '5e-05', 'epoch': '0.1226', 'num_input_tokens_seen': 9970937, 'train_runtime': '5043', 'train_tokens_per_second': '1977'} +{'loss': '0.5567', 'grad_norm': '1.689', 'learning_rate': '5e-05', 'epoch': '0.1227', 'num_input_tokens_seen': 9972984, 'train_runtime': '5044', 'train_tokens_per_second': '1977'} +{'loss': '1.226', 'grad_norm': '1.697', 'learning_rate': '5e-05', 'epoch': '0.1227', 'num_input_tokens_seen': 9975031, 'train_runtime': '5045', 'train_tokens_per_second': '1977'} +{'loss': '0.728', 'grad_norm': '1.414', 'learning_rate': '5e-05', 'epoch': '0.1227', 'num_input_tokens_seen': 9977078, 'train_runtime': '5046', 'train_tokens_per_second': '1977'} +{'loss': '2.174', 'grad_norm': '2.603', 'learning_rate': '5e-05', 'epoch': '0.1227', 'num_input_tokens_seen': 9979125, 'train_runtime': '5047', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '1.711', 'learning_rate': '5e-05', 'epoch': '0.1228', 'num_input_tokens_seen': 9981172, 'train_runtime': '5048', 'train_tokens_per_second': '1977'} +{'loss': '0.6389', 'grad_norm': '1.776', 'learning_rate': '5e-05', 'epoch': '0.1228', 'num_input_tokens_seen': 9983219, 'train_runtime': '5049', 'train_tokens_per_second': '1977'} +{'loss': '1.511', 'grad_norm': '1.997', 'learning_rate': '5e-05', 'epoch': '0.1228', 'num_input_tokens_seen': 9985266, 'train_runtime': '5050', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '2.059', 'learning_rate': '5e-05', 'epoch': '0.1228', 'num_input_tokens_seen': 9987313, 'train_runtime': '5051', 'train_tokens_per_second': '1977'} +{'loss': '2.115', 'grad_norm': '2.5', 'learning_rate': '5e-05', 'epoch': '0.1229', 'num_input_tokens_seen': 9989360, 'train_runtime': '5052', 'train_tokens_per_second': '1977'} +{'loss': '1.305', 'grad_norm': '2.11', 'learning_rate': '5e-05', 'epoch': '0.1229', 'num_input_tokens_seen': 9991407, 'train_runtime': '5053', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '1.772', 'learning_rate': '5e-05', 'epoch': '0.1229', 'num_input_tokens_seen': 9993454, 'train_runtime': '5054', 'train_tokens_per_second': '1977'} +{'loss': '0.7346', 'grad_norm': '1.795', 'learning_rate': '5e-05', 'epoch': '0.1229', 'num_input_tokens_seen': 9995501, 'train_runtime': '5055', 'train_tokens_per_second': '1977'} +{'loss': '0.7969', 'grad_norm': '1.32', 'learning_rate': '5e-05', 'epoch': '0.123', 'num_input_tokens_seen': 9997548, 'train_runtime': '5056', 'train_tokens_per_second': '1977'} +{'loss': '0.3872', 'grad_norm': '1.045', 'learning_rate': '5e-05', 'epoch': '0.123', 'num_input_tokens_seen': 9999595, 'train_runtime': '5057', 'train_tokens_per_second': '1977'} +{'loss': '0.4199', 'grad_norm': '1.122', 'learning_rate': '5e-05', 'epoch': '0.123', 'num_input_tokens_seen': 10001642, 'train_runtime': '5059', 'train_tokens_per_second': '1977'} +{'loss': '0.8243', 'grad_norm': '1.439', 'learning_rate': '5e-05', 'epoch': '0.123', 'num_input_tokens_seen': 10003689, 'train_runtime': '5060', 'train_tokens_per_second': '1977'} +{'loss': '0.9161', 'grad_norm': '1.759', 'learning_rate': '5e-05', 'epoch': '0.1231', 'num_input_tokens_seen': 10005736, 'train_runtime': '5061', 'train_tokens_per_second': '1977'} +{'loss': '0.7416', 'grad_norm': '1.369', 'learning_rate': '5e-05', 'epoch': '0.1231', 'num_input_tokens_seen': 10007783, 'train_runtime': '5062', 'train_tokens_per_second': '1977'} +{'loss': '1.507', 'grad_norm': '2.452', 'learning_rate': '5e-05', 'epoch': '0.1231', 'num_input_tokens_seen': 10009830, 'train_runtime': '5063', 'train_tokens_per_second': '1977'} +{'loss': '0.7881', 'grad_norm': '1.954', 'learning_rate': '5e-05', 'epoch': '0.1231', 'num_input_tokens_seen': 10011877, 'train_runtime': '5064', 'train_tokens_per_second': '1977'} +{'loss': '1.45', 'grad_norm': '2.304', 'learning_rate': '5e-05', 'epoch': '0.1232', 'num_input_tokens_seen': 10013924, 'train_runtime': '5065', 'train_tokens_per_second': '1977'} +{'loss': '1.144', 'grad_norm': '2.069', 'learning_rate': '5e-05', 'epoch': '0.1232', 'num_input_tokens_seen': 10015971, 'train_runtime': '5066', 'train_tokens_per_second': '1977'} +{'loss': '0.6915', 'grad_norm': '1.39', 'learning_rate': '5e-05', 'epoch': '0.1232', 'num_input_tokens_seen': 10018018, 'train_runtime': '5067', 'train_tokens_per_second': '1977'} +{'loss': '0.9944', 'grad_norm': '1.797', 'learning_rate': '5e-05', 'epoch': '0.1232', 'num_input_tokens_seen': 10020065, 'train_runtime': '5068', 'train_tokens_per_second': '1977'} +{'loss': '1.99', 'grad_norm': '3.154', 'learning_rate': '5e-05', 'epoch': '0.1233', 'num_input_tokens_seen': 10022112, 'train_runtime': '5069', 'train_tokens_per_second': '1977'} +{'loss': '1.28', 'grad_norm': '2.277', 'learning_rate': '5e-05', 'epoch': '0.1233', 'num_input_tokens_seen': 10024159, 'train_runtime': '5070', 'train_tokens_per_second': '1977'} +{'loss': '0.4649', 'grad_norm': '1.105', 'learning_rate': '5e-05', 'epoch': '0.1233', 'num_input_tokens_seen': 10026206, 'train_runtime': '5071', 'train_tokens_per_second': '1977'} +{'loss': '1.625', 'grad_norm': '1.763', 'learning_rate': '5e-05', 'epoch': '0.1233', 'num_input_tokens_seen': 10028253, 'train_runtime': '5072', 'train_tokens_per_second': '1977'} +{'loss': '0.4112', 'grad_norm': '1.01', 'learning_rate': '5e-05', 'epoch': '0.1234', 'num_input_tokens_seen': 10030300, 'train_runtime': '5073', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '1.711', 'learning_rate': '5e-05', 'epoch': '0.1234', 'num_input_tokens_seen': 10032347, 'train_runtime': '5074', 'train_tokens_per_second': '1977'} +{'loss': '0.5419', 'grad_norm': '1.27', 'learning_rate': '5e-05', 'epoch': '0.1234', 'num_input_tokens_seen': 10034394, 'train_runtime': '5075', 'train_tokens_per_second': '1977'} +{'loss': '1.771', 'grad_norm': '2.365', 'learning_rate': '5e-05', 'epoch': '0.1234', 'num_input_tokens_seen': 10036441, 'train_runtime': '5076', 'train_tokens_per_second': '1977'} +{'loss': '1.165', 'grad_norm': '1.839', 'learning_rate': '5e-05', 'epoch': '0.1235', 'num_input_tokens_seen': 10038488, 'train_runtime': '5077', 'train_tokens_per_second': '1977'} +{'loss': '2.084', 'grad_norm': '2.22', 'learning_rate': '5e-05', 'epoch': '0.1235', 'num_input_tokens_seen': 10040535, 'train_runtime': '5078', 'train_tokens_per_second': '1977'} +{'loss': '0.4161', 'grad_norm': '0.8381', 'learning_rate': '5e-05', 'epoch': '0.1235', 'num_input_tokens_seen': 10042582, 'train_runtime': '5079', 'train_tokens_per_second': '1977'} +{'loss': '0.3648', 'grad_norm': '1.197', 'learning_rate': '5e-05', 'epoch': '0.1235', 'num_input_tokens_seen': 10044629, 'train_runtime': '5080', 'train_tokens_per_second': '1977'} +{'loss': '1.885', 'grad_norm': '2.517', 'learning_rate': '5e-05', 'epoch': '0.1236', 'num_input_tokens_seen': 10046676, 'train_runtime': '5081', 'train_tokens_per_second': '1977'} +{'loss': '1.877', 'grad_norm': '2.394', 'learning_rate': '5e-05', 'epoch': '0.1236', 'num_input_tokens_seen': 10048723, 'train_runtime': '5082', 'train_tokens_per_second': '1977'} +{'loss': '0.4438', 'grad_norm': '1.206', 'learning_rate': '5e-05', 'epoch': '0.1236', 'num_input_tokens_seen': 10050770, 'train_runtime': '5083', 'train_tokens_per_second': '1977'} +{'loss': '0.5875', 'grad_norm': '1.292', 'learning_rate': '5e-05', 'epoch': '0.1236', 'num_input_tokens_seen': 10052817, 'train_runtime': '5084', 'train_tokens_per_second': '1977'} +{'loss': '1.809', 'grad_norm': '2.426', 'learning_rate': '5e-05', 'epoch': '0.1237', 'num_input_tokens_seen': 10054864, 'train_runtime': '5085', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '1.84', 'learning_rate': '5e-05', 'epoch': '0.1237', 'num_input_tokens_seen': 10056911, 'train_runtime': '5086', 'train_tokens_per_second': '1977'} +{'loss': '0.3062', 'grad_norm': '1.179', 'learning_rate': '5e-05', 'epoch': '0.1237', 'num_input_tokens_seen': 10058958, 'train_runtime': '5088', 'train_tokens_per_second': '1977'} +{'loss': '0.6657', 'grad_norm': '1.635', 'learning_rate': '5e-05', 'epoch': '0.1238', 'num_input_tokens_seen': 10061005, 'train_runtime': '5089', 'train_tokens_per_second': '1977'} +{'loss': '0.6654', 'grad_norm': '1.579', 'learning_rate': '5e-05', 'epoch': '0.1238', 'num_input_tokens_seen': 10063052, 'train_runtime': '5090', 'train_tokens_per_second': '1977'} +{'loss': '1.062', 'grad_norm': '1.864', 'learning_rate': '5e-05', 'epoch': '0.1238', 'num_input_tokens_seen': 10065099, 'train_runtime': '5091', 'train_tokens_per_second': '1977'} +{'loss': '0.9251', 'grad_norm': '1.854', 'learning_rate': '5e-05', 'epoch': '0.1238', 'num_input_tokens_seen': 10067146, 'train_runtime': '5092', 'train_tokens_per_second': '1977'} +{'loss': '1.38', 'grad_norm': '2.248', 'learning_rate': '5e-05', 'epoch': '0.1239', 'num_input_tokens_seen': 10069193, 'train_runtime': '5093', 'train_tokens_per_second': '1977'} +{'loss': '1.518', 'grad_norm': '2.203', 'learning_rate': '5e-05', 'epoch': '0.1239', 'num_input_tokens_seen': 10071240, 'train_runtime': '5094', 'train_tokens_per_second': '1977'} +{'loss': '1.561', 'grad_norm': '2.389', 'learning_rate': '5e-05', 'epoch': '0.1239', 'num_input_tokens_seen': 10073287, 'train_runtime': '5095', 'train_tokens_per_second': '1977'} +{'loss': '2.286', 'grad_norm': '2.684', 'learning_rate': '5e-05', 'epoch': '0.1239', 'num_input_tokens_seen': 10075334, 'train_runtime': '5096', 'train_tokens_per_second': '1977'} +{'loss': '2.069', 'grad_norm': '2.287', 'learning_rate': '5e-05', 'epoch': '0.124', 'num_input_tokens_seen': 10077381, 'train_runtime': '5097', 'train_tokens_per_second': '1977'} +{'loss': '0.6277', 'grad_norm': '1.507', 'learning_rate': '5e-05', 'epoch': '0.124', 'num_input_tokens_seen': 10079428, 'train_runtime': '5098', 'train_tokens_per_second': '1977'} +{'loss': '0.8734', 'grad_norm': '1.717', 'learning_rate': '5e-05', 'epoch': '0.124', 'num_input_tokens_seen': 10081475, 'train_runtime': '5099', 'train_tokens_per_second': '1977'} +{'loss': '0.3456', 'grad_norm': '1.098', 'learning_rate': '5e-05', 'epoch': '0.124', 'num_input_tokens_seen': 10083522, 'train_runtime': '5100', 'train_tokens_per_second': '1977'} +{'loss': '0.9014', 'grad_norm': '1.823', 'learning_rate': '5e-05', 'epoch': '0.1241', 'num_input_tokens_seen': 10085569, 'train_runtime': '5101', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '1.762', 'learning_rate': '5e-05', 'epoch': '0.1241', 'num_input_tokens_seen': 10087616, 'train_runtime': '5102', 'train_tokens_per_second': '1977'} +{'loss': '0.7621', 'grad_norm': '1.646', 'learning_rate': '5e-05', 'epoch': '0.1241', 'num_input_tokens_seen': 10089663, 'train_runtime': '5103', 'train_tokens_per_second': '1977'} +{'loss': '0.7551', 'grad_norm': '1.544', 'learning_rate': '5e-05', 'epoch': '0.1241', 'num_input_tokens_seen': 10091710, 'train_runtime': '5104', 'train_tokens_per_second': '1977'} +{'loss': '1.59', 'grad_norm': '2.137', 'learning_rate': '5e-05', 'epoch': '0.1242', 'num_input_tokens_seen': 10093757, 'train_runtime': '5105', 'train_tokens_per_second': '1977'} +{'loss': '3.383', 'grad_norm': '2.536', 'learning_rate': '5e-05', 'epoch': '0.1242', 'num_input_tokens_seen': 10095804, 'train_runtime': '5106', 'train_tokens_per_second': '1977'} +{'loss': '0.8517', 'grad_norm': '1.769', 'learning_rate': '5e-05', 'epoch': '0.1242', 'num_input_tokens_seen': 10097851, 'train_runtime': '5107', 'train_tokens_per_second': '1977'} +{'loss': '0.8686', 'grad_norm': '1.514', 'learning_rate': '5e-05', 'epoch': '0.1242', 'num_input_tokens_seen': 10099898, 'train_runtime': '5108', 'train_tokens_per_second': '1977'} +{'loss': '0.9643', 'grad_norm': '1.63', 'learning_rate': '5e-05', 'epoch': '0.1243', 'num_input_tokens_seen': 10101945, 'train_runtime': '5109', 'train_tokens_per_second': '1977'} +{'loss': '0.6084', 'grad_norm': '1.336', 'learning_rate': '5e-05', 'epoch': '0.1243', 'num_input_tokens_seen': 10103992, 'train_runtime': '5110', 'train_tokens_per_second': '1977'} +{'loss': '0.6879', 'grad_norm': '1.451', 'learning_rate': '5e-05', 'epoch': '0.1243', 'num_input_tokens_seen': 10106039, 'train_runtime': '5111', 'train_tokens_per_second': '1977'} +{'loss': '0.7065', 'grad_norm': '1.261', 'learning_rate': '5e-05', 'epoch': '0.1243', 'num_input_tokens_seen': 10108086, 'train_runtime': '5112', 'train_tokens_per_second': '1977'} +{'loss': '0.3865', 'grad_norm': '0.987', 'learning_rate': '5e-05', 'epoch': '0.1244', 'num_input_tokens_seen': 10110133, 'train_runtime': '5113', 'train_tokens_per_second': '1977'} +{'loss': '0.4796', 'grad_norm': '1.199', 'learning_rate': '5e-05', 'epoch': '0.1244', 'num_input_tokens_seen': 10112180, 'train_runtime': '5114', 'train_tokens_per_second': '1977'} +{'loss': '0.8764', 'grad_norm': '1.88', 'learning_rate': '5e-05', 'epoch': '0.1244', 'num_input_tokens_seen': 10114227, 'train_runtime': '5115', 'train_tokens_per_second': '1977'} +{'loss': '0.7753', 'grad_norm': '1.799', 'learning_rate': '5e-05', 'epoch': '0.1244', 'num_input_tokens_seen': 10116274, 'train_runtime': '5117', 'train_tokens_per_second': '1977'} +{'loss': '0.9736', 'grad_norm': '2.044', 'learning_rate': '5e-05', 'epoch': '0.1245', 'num_input_tokens_seen': 10118321, 'train_runtime': '5118', 'train_tokens_per_second': '1977'} +{'loss': '0.3373', 'grad_norm': '1.02', 'learning_rate': '5e-05', 'epoch': '0.1245', 'num_input_tokens_seen': 10120368, 'train_runtime': '5119', 'train_tokens_per_second': '1977'} +{'loss': '1.429', 'grad_norm': '2.215', 'learning_rate': '5e-05', 'epoch': '0.1245', 'num_input_tokens_seen': 10122415, 'train_runtime': '5120', 'train_tokens_per_second': '1977'} +{'loss': '0.7292', 'grad_norm': '1.573', 'learning_rate': '5e-05', 'epoch': '0.1245', 'num_input_tokens_seen': 10124462, 'train_runtime': '5121', 'train_tokens_per_second': '1977'} +{'loss': '0.7694', 'grad_norm': '1.632', 'learning_rate': '5e-05', 'epoch': '0.1246', 'num_input_tokens_seen': 10126509, 'train_runtime': '5122', 'train_tokens_per_second': '1977'} +{'loss': '1.883', 'grad_norm': '2.479', 'learning_rate': '5e-05', 'epoch': '0.1246', 'num_input_tokens_seen': 10128556, 'train_runtime': '5123', 'train_tokens_per_second': '1977'} +{'loss': '0.2728', 'grad_norm': '0.9703', 'learning_rate': '5e-05', 'epoch': '0.1246', 'num_input_tokens_seen': 10130603, 'train_runtime': '5124', 'train_tokens_per_second': '1977'} +{'loss': '0.5723', 'grad_norm': '1.135', 'learning_rate': '5e-05', 'epoch': '0.1246', 'num_input_tokens_seen': 10132650, 'train_runtime': '5125', 'train_tokens_per_second': '1977'} +{'loss': '2.249', 'grad_norm': '3.198', 'learning_rate': '5e-05', 'epoch': '0.1247', 'num_input_tokens_seen': 10134697, 'train_runtime': '5126', 'train_tokens_per_second': '1977'} +{'loss': '0.7764', 'grad_norm': '1.715', 'learning_rate': '5e-05', 'epoch': '0.1247', 'num_input_tokens_seen': 10136744, 'train_runtime': '5127', 'train_tokens_per_second': '1977'} +{'loss': '1.443', 'grad_norm': '2.666', 'learning_rate': '5e-05', 'epoch': '0.1247', 'num_input_tokens_seen': 10138791, 'train_runtime': '5128', 'train_tokens_per_second': '1977'} +{'loss': '1.66', 'grad_norm': '1.95', 'learning_rate': '5e-05', 'epoch': '0.1247', 'num_input_tokens_seen': 10140838, 'train_runtime': '5129', 'train_tokens_per_second': '1977'} +{'loss': '0.9764', 'grad_norm': '1.833', 'learning_rate': '5e-05', 'epoch': '0.1248', 'num_input_tokens_seen': 10142885, 'train_runtime': '5130', 'train_tokens_per_second': '1977'} +{'loss': '0.9892', 'grad_norm': '1.66', 'learning_rate': '5e-05', 'epoch': '0.1248', 'num_input_tokens_seen': 10144932, 'train_runtime': '5131', 'train_tokens_per_second': '1977'} +{'loss': '1', 'grad_norm': '1.853', 'learning_rate': '5e-05', 'epoch': '0.1248', 'num_input_tokens_seen': 10146979, 'train_runtime': '5132', 'train_tokens_per_second': '1977'} +{'loss': '0.438', 'grad_norm': '1.267', 'learning_rate': '5e-05', 'epoch': '0.1248', 'num_input_tokens_seen': 10149026, 'train_runtime': '5133', 'train_tokens_per_second': '1977'} +{'loss': '0.4368', 'grad_norm': '1.216', 'learning_rate': '5e-05', 'epoch': '0.1249', 'num_input_tokens_seen': 10151073, 'train_runtime': '5134', 'train_tokens_per_second': '1977'} +{'loss': '0.4278', 'grad_norm': '1.264', 'learning_rate': '5e-05', 'epoch': '0.1249', 'num_input_tokens_seen': 10153120, 'train_runtime': '5135', 'train_tokens_per_second': '1977'} +{'loss': '0.664', 'grad_norm': '1.483', 'learning_rate': '5e-05', 'epoch': '0.1249', 'num_input_tokens_seen': 10155167, 'train_runtime': '5136', 'train_tokens_per_second': '1977'} +{'loss': '1.289', 'grad_norm': '1.942', 'learning_rate': '5e-05', 'epoch': '0.1249', 'num_input_tokens_seen': 10157214, 'train_runtime': '5137', 'train_tokens_per_second': '1977'} +{'loss': '0.4181', 'grad_norm': '1.301', 'learning_rate': '5e-05', 'epoch': '0.125', 'num_input_tokens_seen': 10159261, 'train_runtime': '5138', 'train_tokens_per_second': '1977'} +{'loss': '0.3452', 'grad_norm': '0.9879', 'learning_rate': '5e-05', 'epoch': '0.125', 'num_input_tokens_seen': 10161308, 'train_runtime': '5139', 'train_tokens_per_second': '1977'} +{'loss': '1.204', 'grad_norm': '1.772', 'learning_rate': '5e-05', 'epoch': '0.125', 'num_input_tokens_seen': 10163355, 'train_runtime': '5140', 'train_tokens_per_second': '1977'} +{'loss': '0.9076', 'grad_norm': '1.632', 'learning_rate': '5e-05', 'epoch': '0.125', 'num_input_tokens_seen': 10165402, 'train_runtime': '5141', 'train_tokens_per_second': '1977'} +{'loss': '0.5624', 'grad_norm': '1.596', 'learning_rate': '5e-05', 'epoch': '0.1251', 'num_input_tokens_seen': 10167449, 'train_runtime': '5142', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '2.755', 'learning_rate': '5e-05', 'epoch': '0.1251', 'num_input_tokens_seen': 10169496, 'train_runtime': '5143', 'train_tokens_per_second': '1977'} +{'loss': '0.6798', 'grad_norm': '1.366', 'learning_rate': '5e-05', 'epoch': '0.1251', 'num_input_tokens_seen': 10171543, 'train_runtime': '5145', 'train_tokens_per_second': '1977'} +{'loss': '0.8776', 'grad_norm': '1.522', 'learning_rate': '5e-05', 'epoch': '0.1251', 'num_input_tokens_seen': 10173590, 'train_runtime': '5146', 'train_tokens_per_second': '1977'} +{'loss': '0.3105', 'grad_norm': '1.349', 'learning_rate': '5e-05', 'epoch': '0.1252', 'num_input_tokens_seen': 10175637, 'train_runtime': '5147', 'train_tokens_per_second': '1977'} +{'loss': '0.4144', 'grad_norm': '0.8593', 'learning_rate': '5e-05', 'epoch': '0.1252', 'num_input_tokens_seen': 10177684, 'train_runtime': '5148', 'train_tokens_per_second': '1977'} +{'loss': '0.4829', 'grad_norm': '1.272', 'learning_rate': '5e-05', 'epoch': '0.1252', 'num_input_tokens_seen': 10179731, 'train_runtime': '5149', 'train_tokens_per_second': '1977'} +{'loss': '0.6676', 'grad_norm': '1.46', 'learning_rate': '5e-05', 'epoch': '0.1252', 'num_input_tokens_seen': 10181778, 'train_runtime': '5150', 'train_tokens_per_second': '1977'} +{'loss': '1.175', 'grad_norm': '1.647', 'learning_rate': '5e-05', 'epoch': '0.1253', 'num_input_tokens_seen': 10183825, 'train_runtime': '5151', 'train_tokens_per_second': '1977'} +{'loss': '2.55', 'grad_norm': '2.419', 'learning_rate': '5e-05', 'epoch': '0.1253', 'num_input_tokens_seen': 10185872, 'train_runtime': '5152', 'train_tokens_per_second': '1977'} +{'loss': '0.3372', 'grad_norm': '0.9463', 'learning_rate': '5e-05', 'epoch': '0.1253', 'num_input_tokens_seen': 10187919, 'train_runtime': '5153', 'train_tokens_per_second': '1977'} +{'loss': '0.3903', 'grad_norm': '1.236', 'learning_rate': '5e-05', 'epoch': '0.1253', 'num_input_tokens_seen': 10189966, 'train_runtime': '5154', 'train_tokens_per_second': '1977'} +{'loss': '0.4016', 'grad_norm': '1.033', 'learning_rate': '5e-05', 'epoch': '0.1254', 'num_input_tokens_seen': 10192013, 'train_runtime': '5155', 'train_tokens_per_second': '1977'} +{'loss': '0.4805', 'grad_norm': '1.48', 'learning_rate': '5e-05', 'epoch': '0.1254', 'num_input_tokens_seen': 10194060, 'train_runtime': '5156', 'train_tokens_per_second': '1977'} +{'loss': '0.369', 'grad_norm': '1.216', 'learning_rate': '5e-05', 'epoch': '0.1254', 'num_input_tokens_seen': 10196107, 'train_runtime': '5157', 'train_tokens_per_second': '1977'} +{'loss': '1.176', 'grad_norm': '1.719', 'learning_rate': '5e-05', 'epoch': '0.1254', 'num_input_tokens_seen': 10198154, 'train_runtime': '5158', 'train_tokens_per_second': '1977'} +{'loss': '0.4287', 'grad_norm': '1.352', 'learning_rate': '5e-05', 'epoch': '0.1255', 'num_input_tokens_seen': 10200201, 'train_runtime': '5159', 'train_tokens_per_second': '1977'} +{'loss': '0.527', 'grad_norm': '1.607', 'learning_rate': '5e-05', 'epoch': '0.1255', 'num_input_tokens_seen': 10202248, 'train_runtime': '5160', 'train_tokens_per_second': '1977'} +{'loss': '0.8012', 'grad_norm': '1.808', 'learning_rate': '5e-05', 'epoch': '0.1255', 'num_input_tokens_seen': 10204295, 'train_runtime': '5161', 'train_tokens_per_second': '1977'} +{'loss': '0.54', 'grad_norm': '1.439', 'learning_rate': '5e-05', 'epoch': '0.1255', 'num_input_tokens_seen': 10206342, 'train_runtime': '5162', 'train_tokens_per_second': '1977'} +{'loss': '0.6951', 'grad_norm': '1.76', 'learning_rate': '5e-05', 'epoch': '0.1256', 'num_input_tokens_seen': 10208389, 'train_runtime': '5163', 'train_tokens_per_second': '1977'} +{'loss': '0.8308', 'grad_norm': '1.286', 'learning_rate': '5e-05', 'epoch': '0.1256', 'num_input_tokens_seen': 10210436, 'train_runtime': '5164', 'train_tokens_per_second': '1977'} +{'loss': '0.8887', 'grad_norm': '1.852', 'learning_rate': '5e-05', 'epoch': '0.1256', 'num_input_tokens_seen': 10212483, 'train_runtime': '5165', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '2.243', 'learning_rate': '5e-05', 'epoch': '0.1256', 'num_input_tokens_seen': 10214530, 'train_runtime': '5166', 'train_tokens_per_second': '1977'} +{'loss': '1.414', 'grad_norm': '2.948', 'learning_rate': '5e-05', 'epoch': '0.1257', 'num_input_tokens_seen': 10216577, 'train_runtime': '5167', 'train_tokens_per_second': '1977'} +{'loss': '1.161', 'grad_norm': '2.085', 'learning_rate': '5e-05', 'epoch': '0.1257', 'num_input_tokens_seen': 10218624, 'train_runtime': '5168', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '1.647', 'learning_rate': '5e-05', 'epoch': '0.1257', 'num_input_tokens_seen': 10220671, 'train_runtime': '5169', 'train_tokens_per_second': '1977'} +{'loss': '0.464', 'grad_norm': '1.362', 'learning_rate': '5e-05', 'epoch': '0.1257', 'num_input_tokens_seen': 10222718, 'train_runtime': '5170', 'train_tokens_per_second': '1977'} +{'loss': '0.5411', 'grad_norm': '1.509', 'learning_rate': '5e-05', 'epoch': '0.1258', 'num_input_tokens_seen': 10224765, 'train_runtime': '5171', 'train_tokens_per_second': '1977'} +{'loss': '0.77', 'grad_norm': '1.757', 'learning_rate': '5e-05', 'epoch': '0.1258', 'num_input_tokens_seen': 10226812, 'train_runtime': '5172', 'train_tokens_per_second': '1977'} +{'loss': '1.597', 'grad_norm': '2.342', 'learning_rate': '5e-05', 'epoch': '0.1258', 'num_input_tokens_seen': 10228859, 'train_runtime': '5173', 'train_tokens_per_second': '1977'} +{'loss': '0.9808', 'grad_norm': '1.735', 'learning_rate': '5e-05', 'epoch': '0.1258', 'num_input_tokens_seen': 10230906, 'train_runtime': '5175', 'train_tokens_per_second': '1977'} +{'loss': '1.172', 'grad_norm': '2.47', 'learning_rate': '5e-05', 'epoch': '0.1259', 'num_input_tokens_seen': 10232953, 'train_runtime': '5176', 'train_tokens_per_second': '1977'} +{'loss': '0.4913', 'grad_norm': '1.095', 'learning_rate': '5e-05', 'epoch': '0.1259', 'num_input_tokens_seen': 10235000, 'train_runtime': '5177', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 04:03:42,106 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 04:03:42,106 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 04:03:42,496 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-5000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 04:03:42,509 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-5000/tokenizer_config.json + +{'loss': '1.399', 'grad_norm': '2.466', 'learning_rate': '5e-05', 'epoch': '0.1259', 'num_input_tokens_seen': 10237047, 'train_runtime': '5178', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.916', 'learning_rate': '5e-05', 'epoch': '0.1259', 'num_input_tokens_seen': 10239094, 'train_runtime': '5179', 'train_tokens_per_second': '1977'} +{'loss': '0.2296', 'grad_norm': '0.9774', 'learning_rate': '5e-05', 'epoch': '0.126', 'num_input_tokens_seen': 10241141, 'train_runtime': '5180', 'train_tokens_per_second': '1977'} +{'loss': '2.26', 'grad_norm': '3.401', 'learning_rate': '5e-05', 'epoch': '0.126', 'num_input_tokens_seen': 10243188, 'train_runtime': '5181', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '1.588', 'learning_rate': '5e-05', 'epoch': '0.126', 'num_input_tokens_seen': 10245235, 'train_runtime': '5182', 'train_tokens_per_second': '1977'} +{'loss': '0.9658', 'grad_norm': '1.756', 'learning_rate': '5e-05', 'epoch': '0.126', 'num_input_tokens_seen': 10247282, 'train_runtime': '5183', 'train_tokens_per_second': '1977'} +{'loss': '0.7903', 'grad_norm': '1.536', 'learning_rate': '5e-05', 'epoch': '0.1261', 'num_input_tokens_seen': 10249329, 'train_runtime': '5184', 'train_tokens_per_second': '1977'} +{'loss': '1.15', 'grad_norm': '1.903', 'learning_rate': '5e-05', 'epoch': '0.1261', 'num_input_tokens_seen': 10251376, 'train_runtime': '5185', 'train_tokens_per_second': '1977'} +{'loss': '0.8275', 'grad_norm': '1.506', 'learning_rate': '5e-05', 'epoch': '0.1261', 'num_input_tokens_seen': 10253423, 'train_runtime': '5187', 'train_tokens_per_second': '1977'} +{'loss': '0.4193', 'grad_norm': '1.036', 'learning_rate': '5e-05', 'epoch': '0.1261', 'num_input_tokens_seen': 10255470, 'train_runtime': '5188', 'train_tokens_per_second': '1977'} +{'loss': '0.7866', 'grad_norm': '1.882', 'learning_rate': '5e-05', 'epoch': '0.1262', 'num_input_tokens_seen': 10257517, 'train_runtime': '5189', 'train_tokens_per_second': '1977'} +{'loss': '0.9549', 'grad_norm': '1.965', 'learning_rate': '5e-05', 'epoch': '0.1262', 'num_input_tokens_seen': 10259564, 'train_runtime': '5190', 'train_tokens_per_second': '1977'} +{'loss': '0.3661', 'grad_norm': '1.269', 'learning_rate': '5e-05', 'epoch': '0.1262', 'num_input_tokens_seen': 10261611, 'train_runtime': '5191', 'train_tokens_per_second': '1977'} +{'loss': '0.3649', 'grad_norm': '1.088', 'learning_rate': '5e-05', 'epoch': '0.1262', 'num_input_tokens_seen': 10263658, 'train_runtime': '5192', 'train_tokens_per_second': '1977'} +{'loss': '0.738', 'grad_norm': '1.561', 'learning_rate': '5e-05', 'epoch': '0.1263', 'num_input_tokens_seen': 10265705, 'train_runtime': '5193', 'train_tokens_per_second': '1977'} +{'loss': '0.7742', 'grad_norm': '1.545', 'learning_rate': '5e-05', 'epoch': '0.1263', 'num_input_tokens_seen': 10267752, 'train_runtime': '5194', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '2.262', 'learning_rate': '5e-05', 'epoch': '0.1263', 'num_input_tokens_seen': 10269799, 'train_runtime': '5195', 'train_tokens_per_second': '1977'} +{'loss': '0.6248', 'grad_norm': '1.441', 'learning_rate': '5e-05', 'epoch': '0.1263', 'num_input_tokens_seen': 10271846, 'train_runtime': '5196', 'train_tokens_per_second': '1977'} +{'loss': '0.5284', 'grad_norm': '1.327', 'learning_rate': '5e-05', 'epoch': '0.1264', 'num_input_tokens_seen': 10273893, 'train_runtime': '5197', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '1.92', 'learning_rate': '5e-05', 'epoch': '0.1264', 'num_input_tokens_seen': 10275940, 'train_runtime': '5198', 'train_tokens_per_second': '1977'} +{'loss': '0.6774', 'grad_norm': '1.843', 'learning_rate': '5e-05', 'epoch': '0.1264', 'num_input_tokens_seen': 10277987, 'train_runtime': '5199', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '2.502', 'learning_rate': '5e-05', 'epoch': '0.1264', 'num_input_tokens_seen': 10280034, 'train_runtime': '5200', 'train_tokens_per_second': '1977'} +{'loss': '0.4699', 'grad_norm': '1.304', 'learning_rate': '5e-05', 'epoch': '0.1265', 'num_input_tokens_seen': 10282081, 'train_runtime': '5201', 'train_tokens_per_second': '1977'} +{'loss': '0.6902', 'grad_norm': '1.698', 'learning_rate': '5e-05', 'epoch': '0.1265', 'num_input_tokens_seen': 10284128, 'train_runtime': '5202', 'train_tokens_per_second': '1977'} +{'loss': '1.86', 'grad_norm': '2.798', 'learning_rate': '5e-05', 'epoch': '0.1265', 'num_input_tokens_seen': 10286175, 'train_runtime': '5203', 'train_tokens_per_second': '1977'} +{'loss': '0.6875', 'grad_norm': '1.659', 'learning_rate': '5e-05', 'epoch': '0.1265', 'num_input_tokens_seen': 10288222, 'train_runtime': '5204', 'train_tokens_per_second': '1977'} +{'loss': '0.4187', 'grad_norm': '1.241', 'learning_rate': '5e-05', 'epoch': '0.1266', 'num_input_tokens_seen': 10290269, 'train_runtime': '5205', 'train_tokens_per_second': '1977'} +{'loss': '0.9951', 'grad_norm': '1.515', 'learning_rate': '5e-05', 'epoch': '0.1266', 'num_input_tokens_seen': 10292316, 'train_runtime': '5206', 'train_tokens_per_second': '1977'} +{'loss': '0.838', 'grad_norm': '1.751', 'learning_rate': '5e-05', 'epoch': '0.1266', 'num_input_tokens_seen': 10294363, 'train_runtime': '5207', 'train_tokens_per_second': '1977'} +{'loss': '0.3415', 'grad_norm': '1.333', 'learning_rate': '5e-05', 'epoch': '0.1266', 'num_input_tokens_seen': 10296410, 'train_runtime': '5208', 'train_tokens_per_second': '1977'} +{'loss': '0.6555', 'grad_norm': '1.065', 'learning_rate': '5e-05', 'epoch': '0.1267', 'num_input_tokens_seen': 10298457, 'train_runtime': '5209', 'train_tokens_per_second': '1977'} +{'loss': '1.174', 'grad_norm': '1.661', 'learning_rate': '5e-05', 'epoch': '0.1267', 'num_input_tokens_seen': 10300504, 'train_runtime': '5210', 'train_tokens_per_second': '1977'} +{'loss': '1.031', 'grad_norm': '2.083', 'learning_rate': '5e-05', 'epoch': '0.1267', 'num_input_tokens_seen': 10302551, 'train_runtime': '5211', 'train_tokens_per_second': '1977'} +{'loss': '0.7976', 'grad_norm': '1.732', 'learning_rate': '5e-05', 'epoch': '0.1267', 'num_input_tokens_seen': 10304598, 'train_runtime': '5212', 'train_tokens_per_second': '1977'} +{'loss': '0.9591', 'grad_norm': '1.706', 'learning_rate': '5e-05', 'epoch': '0.1268', 'num_input_tokens_seen': 10306645, 'train_runtime': '5213', 'train_tokens_per_second': '1977'} +{'loss': '0.9912', 'grad_norm': '1.828', 'learning_rate': '5e-05', 'epoch': '0.1268', 'num_input_tokens_seen': 10308692, 'train_runtime': '5215', 'train_tokens_per_second': '1977'} +{'loss': '1.198', 'grad_norm': '1.709', 'learning_rate': '5e-05', 'epoch': '0.1268', 'num_input_tokens_seen': 10310739, 'train_runtime': '5216', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.857', 'learning_rate': '5e-05', 'epoch': '0.1268', 'num_input_tokens_seen': 10312786, 'train_runtime': '5217', 'train_tokens_per_second': '1977'} +{'loss': '0.9702', 'grad_norm': '1.46', 'learning_rate': '5e-05', 'epoch': '0.1269', 'num_input_tokens_seen': 10314833, 'train_runtime': '5218', 'train_tokens_per_second': '1977'} +{'loss': '0.2616', 'grad_norm': '0.9209', 'learning_rate': '5e-05', 'epoch': '0.1269', 'num_input_tokens_seen': 10316880, 'train_runtime': '5219', 'train_tokens_per_second': '1977'} +{'loss': '0.5645', 'grad_norm': '1.236', 'learning_rate': '5e-05', 'epoch': '0.1269', 'num_input_tokens_seen': 10318927, 'train_runtime': '5220', 'train_tokens_per_second': '1977'} +{'loss': '0.5029', 'grad_norm': '1.272', 'learning_rate': '5e-05', 'epoch': '0.1269', 'num_input_tokens_seen': 10320974, 'train_runtime': '5221', 'train_tokens_per_second': '1977'} +{'loss': '0.9587', 'grad_norm': '1.862', 'learning_rate': '5e-05', 'epoch': '0.127', 'num_input_tokens_seen': 10323021, 'train_runtime': '5222', 'train_tokens_per_second': '1977'} +{'loss': '0.7082', 'grad_norm': '1.4', 'learning_rate': '5e-05', 'epoch': '0.127', 'num_input_tokens_seen': 10325068, 'train_runtime': '5223', 'train_tokens_per_second': '1977'} +{'loss': '0.5302', 'grad_norm': '1.277', 'learning_rate': '5e-05', 'epoch': '0.127', 'num_input_tokens_seen': 10327115, 'train_runtime': '5224', 'train_tokens_per_second': '1977'} +{'loss': '0.713', 'grad_norm': '1.587', 'learning_rate': '5e-05', 'epoch': '0.127', 'num_input_tokens_seen': 10329162, 'train_runtime': '5225', 'train_tokens_per_second': '1977'} +{'loss': '0.3098', 'grad_norm': '1.018', 'learning_rate': '5e-05', 'epoch': '0.1271', 'num_input_tokens_seen': 10331209, 'train_runtime': '5226', 'train_tokens_per_second': '1977'} +{'loss': '2.433', 'grad_norm': '2.299', 'learning_rate': '5e-05', 'epoch': '0.1271', 'num_input_tokens_seen': 10333256, 'train_runtime': '5227', 'train_tokens_per_second': '1977'} +{'loss': '0.4278', 'grad_norm': '1.079', 'learning_rate': '5e-05', 'epoch': '0.1271', 'num_input_tokens_seen': 10335303, 'train_runtime': '5228', 'train_tokens_per_second': '1977'} +{'loss': '1.586', 'grad_norm': '1.972', 'learning_rate': '5e-05', 'epoch': '0.1271', 'num_input_tokens_seen': 10337350, 'train_runtime': '5229', 'train_tokens_per_second': '1977'} +{'loss': '1.371', 'grad_norm': '1.899', 'learning_rate': '5e-05', 'epoch': '0.1272', 'num_input_tokens_seen': 10339397, 'train_runtime': '5230', 'train_tokens_per_second': '1977'} +{'loss': '1.676', 'grad_norm': '1.963', 'learning_rate': '5e-05', 'epoch': '0.1272', 'num_input_tokens_seen': 10341444, 'train_runtime': '5231', 'train_tokens_per_second': '1977'} +{'loss': '0.7467', 'grad_norm': '1.747', 'learning_rate': '5e-05', 'epoch': '0.1272', 'num_input_tokens_seen': 10343491, 'train_runtime': '5232', 'train_tokens_per_second': '1977'} +{'loss': '0.7744', 'grad_norm': '1.633', 'learning_rate': '5e-05', 'epoch': '0.1273', 'num_input_tokens_seen': 10345538, 'train_runtime': '5233', 'train_tokens_per_second': '1977'} +{'loss': '0.6019', 'grad_norm': '1.17', 'learning_rate': '5e-05', 'epoch': '0.1273', 'num_input_tokens_seen': 10347585, 'train_runtime': '5234', 'train_tokens_per_second': '1977'} +{'loss': '1.524', 'grad_norm': '2.165', 'learning_rate': '5e-05', 'epoch': '0.1273', 'num_input_tokens_seen': 10349632, 'train_runtime': '5235', 'train_tokens_per_second': '1977'} +{'loss': '0.8991', 'grad_norm': '1.736', 'learning_rate': '5e-05', 'epoch': '0.1273', 'num_input_tokens_seen': 10351679, 'train_runtime': '5236', 'train_tokens_per_second': '1977'} +{'loss': '0.7435', 'grad_norm': '1.28', 'learning_rate': '5e-05', 'epoch': '0.1274', 'num_input_tokens_seen': 10353726, 'train_runtime': '5237', 'train_tokens_per_second': '1977'} +{'loss': '0.7224', 'grad_norm': '1.364', 'learning_rate': '5e-05', 'epoch': '0.1274', 'num_input_tokens_seen': 10355773, 'train_runtime': '5238', 'train_tokens_per_second': '1977'} +{'loss': '0.939', 'grad_norm': '1.629', 'learning_rate': '5e-05', 'epoch': '0.1274', 'num_input_tokens_seen': 10357820, 'train_runtime': '5239', 'train_tokens_per_second': '1977'} +{'loss': '0.7357', 'grad_norm': '1.482', 'learning_rate': '5e-05', 'epoch': '0.1274', 'num_input_tokens_seen': 10359867, 'train_runtime': '5240', 'train_tokens_per_second': '1977'} +{'loss': '0.3791', 'grad_norm': '1.101', 'learning_rate': '5e-05', 'epoch': '0.1275', 'num_input_tokens_seen': 10361914, 'train_runtime': '5241', 'train_tokens_per_second': '1977'} +{'loss': '1.351', 'grad_norm': '1.72', 'learning_rate': '5e-05', 'epoch': '0.1275', 'num_input_tokens_seen': 10363961, 'train_runtime': '5242', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '2.238', 'learning_rate': '5e-05', 'epoch': '0.1275', 'num_input_tokens_seen': 10366008, 'train_runtime': '5243', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '2.12', 'learning_rate': '5e-05', 'epoch': '0.1275', 'num_input_tokens_seen': 10368055, 'train_runtime': '5245', 'train_tokens_per_second': '1977'} +{'loss': '0.5996', 'grad_norm': '1.194', 'learning_rate': '5e-05', 'epoch': '0.1276', 'num_input_tokens_seen': 10370102, 'train_runtime': '5246', 'train_tokens_per_second': '1977'} +{'loss': '0.7198', 'grad_norm': '1.352', 'learning_rate': '5e-05', 'epoch': '0.1276', 'num_input_tokens_seen': 10372149, 'train_runtime': '5247', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '1.951', 'learning_rate': '5e-05', 'epoch': '0.1276', 'num_input_tokens_seen': 10374196, 'train_runtime': '5248', 'train_tokens_per_second': '1977'} +{'loss': '0.3331', 'grad_norm': '1.096', 'learning_rate': '5e-05', 'epoch': '0.1276', 'num_input_tokens_seen': 10376243, 'train_runtime': '5249', 'train_tokens_per_second': '1977'} +{'loss': '0.8112', 'grad_norm': '1.564', 'learning_rate': '5e-05', 'epoch': '0.1277', 'num_input_tokens_seen': 10378290, 'train_runtime': '5250', 'train_tokens_per_second': '1977'} +{'loss': '0.7302', 'grad_norm': '1.395', 'learning_rate': '5e-05', 'epoch': '0.1277', 'num_input_tokens_seen': 10380337, 'train_runtime': '5251', 'train_tokens_per_second': '1977'} +{'loss': '0.9953', 'grad_norm': '1.403', 'learning_rate': '5e-05', 'epoch': '0.1277', 'num_input_tokens_seen': 10382384, 'train_runtime': '5252', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.167', 'learning_rate': '5e-05', 'epoch': '0.1277', 'num_input_tokens_seen': 10384431, 'train_runtime': '5253', 'train_tokens_per_second': '1977'} +{'loss': '1.95', 'grad_norm': '2.428', 'learning_rate': '5e-05', 'epoch': '0.1278', 'num_input_tokens_seen': 10386478, 'train_runtime': '5254', 'train_tokens_per_second': '1977'} +{'loss': '1.659', 'grad_norm': '2.519', 'learning_rate': '5e-05', 'epoch': '0.1278', 'num_input_tokens_seen': 10388525, 'train_runtime': '5255', 'train_tokens_per_second': '1977'} +{'loss': '1.735', 'grad_norm': '2.069', 'learning_rate': '5e-05', 'epoch': '0.1278', 'num_input_tokens_seen': 10390572, 'train_runtime': '5256', 'train_tokens_per_second': '1977'} +{'loss': '0.3871', 'grad_norm': '1.106', 'learning_rate': '5e-05', 'epoch': '0.1278', 'num_input_tokens_seen': 10392619, 'train_runtime': '5257', 'train_tokens_per_second': '1977'} +{'loss': '0.4397', 'grad_norm': '1.341', 'learning_rate': '5e-05', 'epoch': '0.1279', 'num_input_tokens_seen': 10394666, 'train_runtime': '5258', 'train_tokens_per_second': '1977'} +{'loss': '0.842', 'grad_norm': '1.558', 'learning_rate': '5e-05', 'epoch': '0.1279', 'num_input_tokens_seen': 10396713, 'train_runtime': '5259', 'train_tokens_per_second': '1977'} +{'loss': '0.7421', 'grad_norm': '1.701', 'learning_rate': '5e-05', 'epoch': '0.1279', 'num_input_tokens_seen': 10398760, 'train_runtime': '5260', 'train_tokens_per_second': '1977'} +{'loss': '0.4751', 'grad_norm': '1.164', 'learning_rate': '5e-05', 'epoch': '0.1279', 'num_input_tokens_seen': 10400807, 'train_runtime': '5261', 'train_tokens_per_second': '1977'} +{'loss': '0.546', 'grad_norm': '1.423', 'learning_rate': '5e-05', 'epoch': '0.128', 'num_input_tokens_seen': 10402854, 'train_runtime': '5262', 'train_tokens_per_second': '1977'} +{'loss': '1.22', 'grad_norm': '2.019', 'learning_rate': '5e-05', 'epoch': '0.128', 'num_input_tokens_seen': 10404901, 'train_runtime': '5263', 'train_tokens_per_second': '1977'} +{'loss': '0.4284', 'grad_norm': '1.243', 'learning_rate': '5e-05', 'epoch': '0.128', 'num_input_tokens_seen': 10406948, 'train_runtime': '5264', 'train_tokens_per_second': '1977'} +{'loss': '0.8675', 'grad_norm': '1.56', 'learning_rate': '5e-05', 'epoch': '0.128', 'num_input_tokens_seen': 10408995, 'train_runtime': '5265', 'train_tokens_per_second': '1977'} +{'loss': '0.5194', 'grad_norm': '1.665', 'learning_rate': '5e-05', 'epoch': '0.1281', 'num_input_tokens_seen': 10411042, 'train_runtime': '5266', 'train_tokens_per_second': '1977'} +{'loss': '0.3927', 'grad_norm': '1.397', 'learning_rate': '5e-05', 'epoch': '0.1281', 'num_input_tokens_seen': 10413089, 'train_runtime': '5267', 'train_tokens_per_second': '1977'} +{'loss': '1.06', 'grad_norm': '1.862', 'learning_rate': '5e-05', 'epoch': '0.1281', 'num_input_tokens_seen': 10415136, 'train_runtime': '5268', 'train_tokens_per_second': '1977'} +{'loss': '0.8793', 'grad_norm': '1.789', 'learning_rate': '5e-05', 'epoch': '0.1281', 'num_input_tokens_seen': 10417183, 'train_runtime': '5269', 'train_tokens_per_second': '1977'} +{'loss': '0.4088', 'grad_norm': '0.9754', 'learning_rate': '5e-05', 'epoch': '0.1282', 'num_input_tokens_seen': 10419230, 'train_runtime': '5270', 'train_tokens_per_second': '1977'} +{'loss': '0.7535', 'grad_norm': '1.723', 'learning_rate': '5e-05', 'epoch': '0.1282', 'num_input_tokens_seen': 10421277, 'train_runtime': '5271', 'train_tokens_per_second': '1977'} +{'loss': '0.4434', 'grad_norm': '1.109', 'learning_rate': '5e-05', 'epoch': '0.1282', 'num_input_tokens_seen': 10423324, 'train_runtime': '5273', 'train_tokens_per_second': '1977'} +{'loss': '0.4599', 'grad_norm': '0.9447', 'learning_rate': '5e-05', 'epoch': '0.1282', 'num_input_tokens_seen': 10425371, 'train_runtime': '5274', 'train_tokens_per_second': '1977'} +{'loss': '0.3642', 'grad_norm': '1.008', 'learning_rate': '5e-05', 'epoch': '0.1283', 'num_input_tokens_seen': 10427418, 'train_runtime': '5275', 'train_tokens_per_second': '1977'} +{'loss': '2.31', 'grad_norm': '2.328', 'learning_rate': '5e-05', 'epoch': '0.1283', 'num_input_tokens_seen': 10429465, 'train_runtime': '5276', 'train_tokens_per_second': '1977'} +{'loss': '0.4931', 'grad_norm': '1.385', 'learning_rate': '5e-05', 'epoch': '0.1283', 'num_input_tokens_seen': 10431512, 'train_runtime': '5277', 'train_tokens_per_second': '1977'} +{'loss': '0.4735', 'grad_norm': '0.934', 'learning_rate': '5e-05', 'epoch': '0.1283', 'num_input_tokens_seen': 10433559, 'train_runtime': '5278', 'train_tokens_per_second': '1977'} +{'loss': '1.01', 'grad_norm': '1.738', 'learning_rate': '5e-05', 'epoch': '0.1284', 'num_input_tokens_seen': 10435606, 'train_runtime': '5279', 'train_tokens_per_second': '1977'} +{'loss': '1.08', 'grad_norm': '2.241', 'learning_rate': '5e-05', 'epoch': '0.1284', 'num_input_tokens_seen': 10437653, 'train_runtime': '5280', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '2.175', 'learning_rate': '5e-05', 'epoch': '0.1284', 'num_input_tokens_seen': 10439700, 'train_runtime': '5281', 'train_tokens_per_second': '1977'} +{'loss': '1.507', 'grad_norm': '2.285', 'learning_rate': '5e-05', 'epoch': '0.1284', 'num_input_tokens_seen': 10441747, 'train_runtime': '5282', 'train_tokens_per_second': '1977'} +{'loss': '0.4478', 'grad_norm': '0.9786', 'learning_rate': '5e-05', 'epoch': '0.1285', 'num_input_tokens_seen': 10443794, 'train_runtime': '5283', 'train_tokens_per_second': '1977'} +{'loss': '0.4804', 'grad_norm': '1.3', 'learning_rate': '5e-05', 'epoch': '0.1285', 'num_input_tokens_seen': 10445841, 'train_runtime': '5284', 'train_tokens_per_second': '1977'} +{'loss': '0.246', 'grad_norm': '1.209', 'learning_rate': '5e-05', 'epoch': '0.1285', 'num_input_tokens_seen': 10447888, 'train_runtime': '5285', 'train_tokens_per_second': '1977'} +{'loss': '0.3791', 'grad_norm': '1.243', 'learning_rate': '5e-05', 'epoch': '0.1285', 'num_input_tokens_seen': 10449935, 'train_runtime': '5286', 'train_tokens_per_second': '1977'} +{'loss': '0.6811', 'grad_norm': '1.286', 'learning_rate': '5e-05', 'epoch': '0.1286', 'num_input_tokens_seen': 10451982, 'train_runtime': '5287', 'train_tokens_per_second': '1977'} +{'loss': '0.4227', 'grad_norm': '1.152', 'learning_rate': '5e-05', 'epoch': '0.1286', 'num_input_tokens_seen': 10454029, 'train_runtime': '5288', 'train_tokens_per_second': '1977'} +{'loss': '0.2784', 'grad_norm': '1.128', 'learning_rate': '5e-05', 'epoch': '0.1286', 'num_input_tokens_seen': 10456076, 'train_runtime': '5289', 'train_tokens_per_second': '1977'} +{'loss': '1.747', 'grad_norm': '2.353', 'learning_rate': '5e-05', 'epoch': '0.1286', 'num_input_tokens_seen': 10458123, 'train_runtime': '5290', 'train_tokens_per_second': '1977'} +{'loss': '0.5895', 'grad_norm': '1.185', 'learning_rate': '5e-05', 'epoch': '0.1287', 'num_input_tokens_seen': 10460170, 'train_runtime': '5291', 'train_tokens_per_second': '1977'} +{'loss': '1.356', 'grad_norm': '2.059', 'learning_rate': '5e-05', 'epoch': '0.1287', 'num_input_tokens_seen': 10462217, 'train_runtime': '5292', 'train_tokens_per_second': '1977'} +{'loss': '0.4909', 'grad_norm': '1.057', 'learning_rate': '5e-05', 'epoch': '0.1287', 'num_input_tokens_seen': 10464264, 'train_runtime': '5293', 'train_tokens_per_second': '1977'} +{'loss': '1.605', 'grad_norm': '2.252', 'learning_rate': '5e-05', 'epoch': '0.1287', 'num_input_tokens_seen': 10466311, 'train_runtime': '5294', 'train_tokens_per_second': '1977'} +{'loss': '0.7183', 'grad_norm': '1.561', 'learning_rate': '5e-05', 'epoch': '0.1288', 'num_input_tokens_seen': 10468358, 'train_runtime': '5295', 'train_tokens_per_second': '1977'} +{'loss': '1.744', 'grad_norm': '2.576', 'learning_rate': '5e-05', 'epoch': '0.1288', 'num_input_tokens_seen': 10470405, 'train_runtime': '5296', 'train_tokens_per_second': '1977'} +{'loss': '1.061', 'grad_norm': '2.25', 'learning_rate': '5e-05', 'epoch': '0.1288', 'num_input_tokens_seen': 10472452, 'train_runtime': '5297', 'train_tokens_per_second': '1977'} +{'loss': '0.4324', 'grad_norm': '1.198', 'learning_rate': '5e-05', 'epoch': '0.1288', 'num_input_tokens_seen': 10474499, 'train_runtime': '5298', 'train_tokens_per_second': '1977'} +{'loss': '0.8818', 'grad_norm': '1.721', 'learning_rate': '5e-05', 'epoch': '0.1289', 'num_input_tokens_seen': 10476546, 'train_runtime': '5299', 'train_tokens_per_second': '1977'} +{'loss': '1.712', 'grad_norm': '2.428', 'learning_rate': '5e-05', 'epoch': '0.1289', 'num_input_tokens_seen': 10478593, 'train_runtime': '5300', 'train_tokens_per_second': '1977'} +{'loss': '0.7309', 'grad_norm': '1.206', 'learning_rate': '5e-05', 'epoch': '0.1289', 'num_input_tokens_seen': 10480640, 'train_runtime': '5302', 'train_tokens_per_second': '1977'} +{'loss': '1.258', 'grad_norm': '2.107', 'learning_rate': '5e-05', 'epoch': '0.1289', 'num_input_tokens_seen': 10482687, 'train_runtime': '5303', 'train_tokens_per_second': '1977'} +{'loss': '1.779', 'grad_norm': '2.352', 'learning_rate': '5e-05', 'epoch': '0.129', 'num_input_tokens_seen': 10484734, 'train_runtime': '5304', 'train_tokens_per_second': '1977'} +{'loss': '1.516', 'grad_norm': '2.137', 'learning_rate': '5e-05', 'epoch': '0.129', 'num_input_tokens_seen': 10486781, 'train_runtime': '5305', 'train_tokens_per_second': '1977'} +{'loss': '0.8596', 'grad_norm': '1.746', 'learning_rate': '5e-05', 'epoch': '0.129', 'num_input_tokens_seen': 10488828, 'train_runtime': '5306', 'train_tokens_per_second': '1977'} +{'loss': '0.6985', 'grad_norm': '1.904', 'learning_rate': '5e-05', 'epoch': '0.129', 'num_input_tokens_seen': 10490875, 'train_runtime': '5307', 'train_tokens_per_second': '1977'} +{'loss': '1.688', 'grad_norm': '2.198', 'learning_rate': '5e-05', 'epoch': '0.1291', 'num_input_tokens_seen': 10492922, 'train_runtime': '5308', 'train_tokens_per_second': '1977'} +{'loss': '0.6643', 'grad_norm': '1.647', 'learning_rate': '5e-05', 'epoch': '0.1291', 'num_input_tokens_seen': 10494969, 'train_runtime': '5309', 'train_tokens_per_second': '1977'} +{'loss': '0.4154', 'grad_norm': '1.071', 'learning_rate': '5e-05', 'epoch': '0.1291', 'num_input_tokens_seen': 10497016, 'train_runtime': '5310', 'train_tokens_per_second': '1977'} +{'loss': '0.5134', 'grad_norm': '1.135', 'learning_rate': '5e-05', 'epoch': '0.1291', 'num_input_tokens_seen': 10499063, 'train_runtime': '5311', 'train_tokens_per_second': '1977'} +{'loss': '0.7214', 'grad_norm': '1.457', 'learning_rate': '5e-05', 'epoch': '0.1292', 'num_input_tokens_seen': 10501110, 'train_runtime': '5312', 'train_tokens_per_second': '1977'} +{'loss': '0.3944', 'grad_norm': '0.8144', 'learning_rate': '5e-05', 'epoch': '0.1292', 'num_input_tokens_seen': 10503157, 'train_runtime': '5313', 'train_tokens_per_second': '1977'} +{'loss': '1.979', 'grad_norm': '2.389', 'learning_rate': '5e-05', 'epoch': '0.1292', 'num_input_tokens_seen': 10505204, 'train_runtime': '5314', 'train_tokens_per_second': '1977'} +{'loss': '0.6556', 'grad_norm': '1.332', 'learning_rate': '5e-05', 'epoch': '0.1292', 'num_input_tokens_seen': 10507251, 'train_runtime': '5315', 'train_tokens_per_second': '1977'} +{'loss': '0.8221', 'grad_norm': '1.685', 'learning_rate': '5e-05', 'epoch': '0.1293', 'num_input_tokens_seen': 10509298, 'train_runtime': '5316', 'train_tokens_per_second': '1977'} +{'loss': '1.545', 'grad_norm': '2.056', 'learning_rate': '5e-05', 'epoch': '0.1293', 'num_input_tokens_seen': 10511345, 'train_runtime': '5317', 'train_tokens_per_second': '1977'} +{'loss': '0.3154', 'grad_norm': '0.9967', 'learning_rate': '5e-05', 'epoch': '0.1293', 'num_input_tokens_seen': 10513392, 'train_runtime': '5318', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.796', 'learning_rate': '5e-05', 'epoch': '0.1293', 'num_input_tokens_seen': 10515439, 'train_runtime': '5319', 'train_tokens_per_second': '1977'} +{'loss': '0.9826', 'grad_norm': '1.624', 'learning_rate': '5e-05', 'epoch': '0.1294', 'num_input_tokens_seen': 10517486, 'train_runtime': '5320', 'train_tokens_per_second': '1977'} +{'loss': '0.8368', 'grad_norm': '1.581', 'learning_rate': '5e-05', 'epoch': '0.1294', 'num_input_tokens_seen': 10519533, 'train_runtime': '5321', 'train_tokens_per_second': '1977'} +{'loss': '1.027', 'grad_norm': '1.583', 'learning_rate': '5e-05', 'epoch': '0.1294', 'num_input_tokens_seen': 10521580, 'train_runtime': '5322', 'train_tokens_per_second': '1977'} +{'loss': '1.923', 'grad_norm': '2.955', 'learning_rate': '5e-05', 'epoch': '0.1294', 'num_input_tokens_seen': 10523627, 'train_runtime': '5323', 'train_tokens_per_second': '1977'} +{'loss': '0.3884', 'grad_norm': '1.017', 'learning_rate': '5e-05', 'epoch': '0.1295', 'num_input_tokens_seen': 10525674, 'train_runtime': '5324', 'train_tokens_per_second': '1977'} +{'loss': '2.6', 'grad_norm': '3.208', 'learning_rate': '5e-05', 'epoch': '0.1295', 'num_input_tokens_seen': 10527721, 'train_runtime': '5325', 'train_tokens_per_second': '1977'} +{'loss': '0.6124', 'grad_norm': '1.341', 'learning_rate': '5e-05', 'epoch': '0.1295', 'num_input_tokens_seen': 10529768, 'train_runtime': '5326', 'train_tokens_per_second': '1977'} +{'loss': '0.9004', 'grad_norm': '1.515', 'learning_rate': '5e-05', 'epoch': '0.1295', 'num_input_tokens_seen': 10531815, 'train_runtime': '5327', 'train_tokens_per_second': '1977'} +{'loss': '1.432', 'grad_norm': '1.916', 'learning_rate': '5e-05', 'epoch': '0.1296', 'num_input_tokens_seen': 10533862, 'train_runtime': '5328', 'train_tokens_per_second': '1977'} +{'loss': '0.3567', 'grad_norm': '1.113', 'learning_rate': '5e-05', 'epoch': '0.1296', 'num_input_tokens_seen': 10535909, 'train_runtime': '5329', 'train_tokens_per_second': '1977'} +{'loss': '1.69', 'grad_norm': '2.371', 'learning_rate': '5e-05', 'epoch': '0.1296', 'num_input_tokens_seen': 10537956, 'train_runtime': '5330', 'train_tokens_per_second': '1977'} +{'loss': '0.4352', 'grad_norm': '1.2', 'learning_rate': '5e-05', 'epoch': '0.1296', 'num_input_tokens_seen': 10540003, 'train_runtime': '5332', 'train_tokens_per_second': '1977'} +{'loss': '0.2858', 'grad_norm': '1.046', 'learning_rate': '5e-05', 'epoch': '0.1297', 'num_input_tokens_seen': 10542050, 'train_runtime': '5333', 'train_tokens_per_second': '1977'} +{'loss': '0.524', 'grad_norm': '1.33', 'learning_rate': '5e-05', 'epoch': '0.1297', 'num_input_tokens_seen': 10544097, 'train_runtime': '5334', 'train_tokens_per_second': '1977'} +{'loss': '0.6089', 'grad_norm': '1.484', 'learning_rate': '5e-05', 'epoch': '0.1297', 'num_input_tokens_seen': 10546144, 'train_runtime': '5335', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '2.808', 'learning_rate': '5e-05', 'epoch': '0.1297', 'num_input_tokens_seen': 10548191, 'train_runtime': '5336', 'train_tokens_per_second': '1977'} +{'loss': '1.305', 'grad_norm': '2.013', 'learning_rate': '5e-05', 'epoch': '0.1298', 'num_input_tokens_seen': 10550238, 'train_runtime': '5337', 'train_tokens_per_second': '1977'} +{'loss': '1.513', 'grad_norm': '2.344', 'learning_rate': '5e-05', 'epoch': '0.1298', 'num_input_tokens_seen': 10552285, 'train_runtime': '5338', 'train_tokens_per_second': '1977'} +{'loss': '0.4475', 'grad_norm': '1.347', 'learning_rate': '5e-05', 'epoch': '0.1298', 'num_input_tokens_seen': 10554332, 'train_runtime': '5339', 'train_tokens_per_second': '1977'} +{'loss': '0.2859', 'grad_norm': '1.49', 'learning_rate': '5e-05', 'epoch': '0.1298', 'num_input_tokens_seen': 10556379, 'train_runtime': '5340', 'train_tokens_per_second': '1977'} +{'loss': '1.613', 'grad_norm': '2.242', 'learning_rate': '5e-05', 'epoch': '0.1299', 'num_input_tokens_seen': 10558426, 'train_runtime': '5341', 'train_tokens_per_second': '1977'} +{'loss': '2.267', 'grad_norm': '2.731', 'learning_rate': '5e-05', 'epoch': '0.1299', 'num_input_tokens_seen': 10560473, 'train_runtime': '5342', 'train_tokens_per_second': '1977'} +{'loss': '0.6818', 'grad_norm': '1.815', 'learning_rate': '5e-05', 'epoch': '0.1299', 'num_input_tokens_seen': 10562520, 'train_runtime': '5343', 'train_tokens_per_second': '1977'} +{'loss': '0.8271', 'grad_norm': '1.693', 'learning_rate': '5e-05', 'epoch': '0.1299', 'num_input_tokens_seen': 10564567, 'train_runtime': '5344', 'train_tokens_per_second': '1977'} +{'loss': '1.176', 'grad_norm': '2.237', 'learning_rate': '5e-05', 'epoch': '0.13', 'num_input_tokens_seen': 10566614, 'train_runtime': '5345', 'train_tokens_per_second': '1977'} +{'loss': '0.6253', 'grad_norm': '1.468', 'learning_rate': '5e-05', 'epoch': '0.13', 'num_input_tokens_seen': 10568661, 'train_runtime': '5346', 'train_tokens_per_second': '1977'} +{'loss': '1.085', 'grad_norm': '1.818', 'learning_rate': '5e-05', 'epoch': '0.13', 'num_input_tokens_seen': 10570708, 'train_runtime': '5347', 'train_tokens_per_second': '1977'} +{'loss': '1.556', 'grad_norm': '1.58', 'learning_rate': '5e-05', 'epoch': '0.13', 'num_input_tokens_seen': 10572755, 'train_runtime': '5348', 'train_tokens_per_second': '1977'} +{'loss': '1.208', 'grad_norm': '1.785', 'learning_rate': '5e-05', 'epoch': '0.1301', 'num_input_tokens_seen': 10574802, 'train_runtime': '5349', 'train_tokens_per_second': '1977'} +{'loss': '0.4852', 'grad_norm': '1.161', 'learning_rate': '5e-05', 'epoch': '0.1301', 'num_input_tokens_seen': 10576849, 'train_runtime': '5350', 'train_tokens_per_second': '1977'} +{'loss': '0.3246', 'grad_norm': '0.8964', 'learning_rate': '5e-05', 'epoch': '0.1301', 'num_input_tokens_seen': 10578896, 'train_runtime': '5351', 'train_tokens_per_second': '1977'} +{'loss': '0.3916', 'grad_norm': '1.034', 'learning_rate': '5e-05', 'epoch': '0.1301', 'num_input_tokens_seen': 10580943, 'train_runtime': '5352', 'train_tokens_per_second': '1977'} +{'loss': '0.9108', 'grad_norm': '1.474', 'learning_rate': '5e-05', 'epoch': '0.1302', 'num_input_tokens_seen': 10582990, 'train_runtime': '5353', 'train_tokens_per_second': '1977'} +{'loss': '0.8346', 'grad_norm': '1.4', 'learning_rate': '5e-05', 'epoch': '0.1302', 'num_input_tokens_seen': 10585037, 'train_runtime': '5354', 'train_tokens_per_second': '1977'} +{'loss': '0.5201', 'grad_norm': '1.51', 'learning_rate': '5e-05', 'epoch': '0.1302', 'num_input_tokens_seen': 10587084, 'train_runtime': '5355', 'train_tokens_per_second': '1977'} +{'loss': '0.3845', 'grad_norm': '0.9929', 'learning_rate': '5e-05', 'epoch': '0.1302', 'num_input_tokens_seen': 10589131, 'train_runtime': '5356', 'train_tokens_per_second': '1977'} +{'loss': '0.7522', 'grad_norm': '1.536', 'learning_rate': '5e-05', 'epoch': '0.1303', 'num_input_tokens_seen': 10591178, 'train_runtime': '5357', 'train_tokens_per_second': '1977'} +{'loss': '1.483', 'grad_norm': '1.927', 'learning_rate': '5e-05', 'epoch': '0.1303', 'num_input_tokens_seen': 10593225, 'train_runtime': '5358', 'train_tokens_per_second': '1977'} +{'loss': '0.4441', 'grad_norm': '1.301', 'learning_rate': '5e-05', 'epoch': '0.1303', 'num_input_tokens_seen': 10595272, 'train_runtime': '5359', 'train_tokens_per_second': '1977'} +{'loss': '0.6699', 'grad_norm': '1.343', 'learning_rate': '5e-05', 'epoch': '0.1303', 'num_input_tokens_seen': 10597319, 'train_runtime': '5360', 'train_tokens_per_second': '1977'} +{'loss': '0.3903', 'grad_norm': '1.09', 'learning_rate': '5e-05', 'epoch': '0.1304', 'num_input_tokens_seen': 10599366, 'train_runtime': '5362', 'train_tokens_per_second': '1977'} +{'loss': '0.8463', 'grad_norm': '1.719', 'learning_rate': '5e-05', 'epoch': '0.1304', 'num_input_tokens_seen': 10601413, 'train_runtime': '5363', 'train_tokens_per_second': '1977'} +{'loss': '0.829', 'grad_norm': '1.296', 'learning_rate': '5e-05', 'epoch': '0.1304', 'num_input_tokens_seen': 10603460, 'train_runtime': '5364', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.769', 'learning_rate': '5e-05', 'epoch': '0.1304', 'num_input_tokens_seen': 10605507, 'train_runtime': '5365', 'train_tokens_per_second': '1977'} +{'loss': '0.7307', 'grad_norm': '1.382', 'learning_rate': '5e-05', 'epoch': '0.1305', 'num_input_tokens_seen': 10607554, 'train_runtime': '5366', 'train_tokens_per_second': '1977'} +{'loss': '1.012', 'grad_norm': '1.94', 'learning_rate': '5e-05', 'epoch': '0.1305', 'num_input_tokens_seen': 10609601, 'train_runtime': '5367', 'train_tokens_per_second': '1977'} +{'loss': '0.4525', 'grad_norm': '1.453', 'learning_rate': '5e-05', 'epoch': '0.1305', 'num_input_tokens_seen': 10611648, 'train_runtime': '5368', 'train_tokens_per_second': '1977'} +{'loss': '0.6815', 'grad_norm': '1.274', 'learning_rate': '5e-05', 'epoch': '0.1305', 'num_input_tokens_seen': 10613695, 'train_runtime': '5369', 'train_tokens_per_second': '1977'} +{'loss': '1.384', 'grad_norm': '1.961', 'learning_rate': '5e-05', 'epoch': '0.1306', 'num_input_tokens_seen': 10615742, 'train_runtime': '5370', 'train_tokens_per_second': '1977'} +{'loss': '0.3781', 'grad_norm': '1.066', 'learning_rate': '5e-05', 'epoch': '0.1306', 'num_input_tokens_seen': 10617789, 'train_runtime': '5371', 'train_tokens_per_second': '1977'} +{'loss': '0.3838', 'grad_norm': '1.173', 'learning_rate': '5e-05', 'epoch': '0.1306', 'num_input_tokens_seen': 10619836, 'train_runtime': '5372', 'train_tokens_per_second': '1977'} +{'loss': '0.9751', 'grad_norm': '1.604', 'learning_rate': '5e-05', 'epoch': '0.1306', 'num_input_tokens_seen': 10621883, 'train_runtime': '5373', 'train_tokens_per_second': '1977'} +{'loss': '0.3709', 'grad_norm': '1.089', 'learning_rate': '5e-05', 'epoch': '0.1307', 'num_input_tokens_seen': 10623930, 'train_runtime': '5374', 'train_tokens_per_second': '1977'} +{'loss': '0.2542', 'grad_norm': '0.9785', 'learning_rate': '5e-05', 'epoch': '0.1307', 'num_input_tokens_seen': 10625977, 'train_runtime': '5375', 'train_tokens_per_second': '1977'} +{'loss': '0.9783', 'grad_norm': '1.968', 'learning_rate': '5e-05', 'epoch': '0.1307', 'num_input_tokens_seen': 10628024, 'train_runtime': '5376', 'train_tokens_per_second': '1977'} +{'loss': '0.4592', 'grad_norm': '1.014', 'learning_rate': '5e-05', 'epoch': '0.1308', 'num_input_tokens_seen': 10630071, 'train_runtime': '5377', 'train_tokens_per_second': '1977'} +{'loss': '0.2741', 'grad_norm': '1.076', 'learning_rate': '5e-05', 'epoch': '0.1308', 'num_input_tokens_seen': 10632118, 'train_runtime': '5378', 'train_tokens_per_second': '1977'} +{'loss': '0.6454', 'grad_norm': '1.041', 'learning_rate': '5e-05', 'epoch': '0.1308', 'num_input_tokens_seen': 10634165, 'train_runtime': '5379', 'train_tokens_per_second': '1977'} +{'loss': '0.2743', 'grad_norm': '1.218', 'learning_rate': '5e-05', 'epoch': '0.1308', 'num_input_tokens_seen': 10636212, 'train_runtime': '5380', 'train_tokens_per_second': '1977'} +{'loss': '0.6005', 'grad_norm': '1.484', 'learning_rate': '5e-05', 'epoch': '0.1309', 'num_input_tokens_seen': 10638259, 'train_runtime': '5381', 'train_tokens_per_second': '1977'} +{'loss': '0.631', 'grad_norm': '1.547', 'learning_rate': '5e-05', 'epoch': '0.1309', 'num_input_tokens_seen': 10640306, 'train_runtime': '5382', 'train_tokens_per_second': '1977'} +{'loss': '1.54', 'grad_norm': '2.205', 'learning_rate': '5e-05', 'epoch': '0.1309', 'num_input_tokens_seen': 10642353, 'train_runtime': '5383', 'train_tokens_per_second': '1977'} +{'loss': '0.4569', 'grad_norm': '1.661', 'learning_rate': '5e-05', 'epoch': '0.1309', 'num_input_tokens_seen': 10644400, 'train_runtime': '5384', 'train_tokens_per_second': '1977'} +{'loss': '0.7088', 'grad_norm': '1.738', 'learning_rate': '5e-05', 'epoch': '0.131', 'num_input_tokens_seen': 10646447, 'train_runtime': '5385', 'train_tokens_per_second': '1977'} +{'loss': '1.743', 'grad_norm': '2.045', 'learning_rate': '5e-05', 'epoch': '0.131', 'num_input_tokens_seen': 10648494, 'train_runtime': '5386', 'train_tokens_per_second': '1977'} +{'loss': '1.076', 'grad_norm': '1.976', 'learning_rate': '5e-05', 'epoch': '0.131', 'num_input_tokens_seen': 10650541, 'train_runtime': '5387', 'train_tokens_per_second': '1977'} +{'loss': '0.3491', 'grad_norm': '1.651', 'learning_rate': '5e-05', 'epoch': '0.131', 'num_input_tokens_seen': 10652588, 'train_runtime': '5388', 'train_tokens_per_second': '1977'} +{'loss': '0.6189', 'grad_norm': '1.435', 'learning_rate': '5e-05', 'epoch': '0.1311', 'num_input_tokens_seen': 10654635, 'train_runtime': '5389', 'train_tokens_per_second': '1977'} +{'loss': '1.143', 'grad_norm': '1.903', 'learning_rate': '5e-05', 'epoch': '0.1311', 'num_input_tokens_seen': 10656682, 'train_runtime': '5391', 'train_tokens_per_second': '1977'} +{'loss': '0.3883', 'grad_norm': '1.149', 'learning_rate': '5e-05', 'epoch': '0.1311', 'num_input_tokens_seen': 10658729, 'train_runtime': '5392', 'train_tokens_per_second': '1977'} +{'loss': '1.073', 'grad_norm': '1.878', 'learning_rate': '5e-05', 'epoch': '0.1311', 'num_input_tokens_seen': 10660776, 'train_runtime': '5393', 'train_tokens_per_second': '1977'} +{'loss': '0.9241', 'grad_norm': '1.678', 'learning_rate': '5e-05', 'epoch': '0.1312', 'num_input_tokens_seen': 10662823, 'train_runtime': '5394', 'train_tokens_per_second': '1977'} +{'loss': '1.524', 'grad_norm': '2', 'learning_rate': '5e-05', 'epoch': '0.1312', 'num_input_tokens_seen': 10664870, 'train_runtime': '5395', 'train_tokens_per_second': '1977'} +{'loss': '0.7914', 'grad_norm': '1.241', 'learning_rate': '5e-05', 'epoch': '0.1312', 'num_input_tokens_seen': 10666917, 'train_runtime': '5396', 'train_tokens_per_second': '1977'} +{'loss': '1.089', 'grad_norm': '1.742', 'learning_rate': '4.999e-05', 'epoch': '0.1312', 'num_input_tokens_seen': 10668964, 'train_runtime': '5397', 'train_tokens_per_second': '1977'} +{'loss': '0.7471', 'grad_norm': '1.553', 'learning_rate': '4.999e-05', 'epoch': '0.1313', 'num_input_tokens_seen': 10671011, 'train_runtime': '5398', 'train_tokens_per_second': '1977'} +{'loss': '0.7376', 'grad_norm': '1.738', 'learning_rate': '4.999e-05', 'epoch': '0.1313', 'num_input_tokens_seen': 10673058, 'train_runtime': '5399', 'train_tokens_per_second': '1977'} +{'loss': '0.4271', 'grad_norm': '1.292', 'learning_rate': '4.999e-05', 'epoch': '0.1313', 'num_input_tokens_seen': 10675105, 'train_runtime': '5400', 'train_tokens_per_second': '1977'} +{'loss': '1.089', 'grad_norm': '2.184', 'learning_rate': '4.999e-05', 'epoch': '0.1313', 'num_input_tokens_seen': 10677152, 'train_runtime': '5401', 'train_tokens_per_second': '1977'} +{'loss': '0.267', 'grad_norm': '0.956', 'learning_rate': '4.999e-05', 'epoch': '0.1314', 'num_input_tokens_seen': 10679199, 'train_runtime': '5402', 'train_tokens_per_second': '1977'} +{'loss': '2.091', 'grad_norm': '1.968', 'learning_rate': '4.999e-05', 'epoch': '0.1314', 'num_input_tokens_seen': 10681246, 'train_runtime': '5403', 'train_tokens_per_second': '1977'} +{'loss': '1.669', 'grad_norm': '2.003', 'learning_rate': '4.999e-05', 'epoch': '0.1314', 'num_input_tokens_seen': 10683293, 'train_runtime': '5404', 'train_tokens_per_second': '1977'} +{'loss': '0.8942', 'grad_norm': '1.654', 'learning_rate': '4.999e-05', 'epoch': '0.1314', 'num_input_tokens_seen': 10685340, 'train_runtime': '5405', 'train_tokens_per_second': '1977'} +{'loss': '1.153', 'grad_norm': '1.861', 'learning_rate': '4.999e-05', 'epoch': '0.1315', 'num_input_tokens_seen': 10687387, 'train_runtime': '5406', 'train_tokens_per_second': '1977'} +{'loss': '0.8273', 'grad_norm': '1.246', 'learning_rate': '4.999e-05', 'epoch': '0.1315', 'num_input_tokens_seen': 10689434, 'train_runtime': '5407', 'train_tokens_per_second': '1977'} +{'loss': '0.4529', 'grad_norm': '1.068', 'learning_rate': '4.999e-05', 'epoch': '0.1315', 'num_input_tokens_seen': 10691481, 'train_runtime': '5408', 'train_tokens_per_second': '1977'} +{'loss': '0.8817', 'grad_norm': '1.7', 'learning_rate': '4.999e-05', 'epoch': '0.1315', 'num_input_tokens_seen': 10693528, 'train_runtime': '5409', 'train_tokens_per_second': '1977'} +{'loss': '0.9799', 'grad_norm': '1.794', 'learning_rate': '4.999e-05', 'epoch': '0.1316', 'num_input_tokens_seen': 10695575, 'train_runtime': '5410', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.831', 'learning_rate': '4.999e-05', 'epoch': '0.1316', 'num_input_tokens_seen': 10697622, 'train_runtime': '5411', 'train_tokens_per_second': '1977'} +{'loss': '0.391', 'grad_norm': '0.9385', 'learning_rate': '4.999e-05', 'epoch': '0.1316', 'num_input_tokens_seen': 10699669, 'train_runtime': '5412', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '2.217', 'learning_rate': '4.999e-05', 'epoch': '0.1316', 'num_input_tokens_seen': 10701716, 'train_runtime': '5413', 'train_tokens_per_second': '1977'} +{'loss': '0.8374', 'grad_norm': '1.608', 'learning_rate': '4.999e-05', 'epoch': '0.1317', 'num_input_tokens_seen': 10703763, 'train_runtime': '5414', 'train_tokens_per_second': '1977'} +{'loss': '0.9846', 'grad_norm': '1.647', 'learning_rate': '4.999e-05', 'epoch': '0.1317', 'num_input_tokens_seen': 10705810, 'train_runtime': '5415', 'train_tokens_per_second': '1977'} +{'loss': '0.4149', 'grad_norm': '1.13', 'learning_rate': '4.999e-05', 'epoch': '0.1317', 'num_input_tokens_seen': 10707857, 'train_runtime': '5416', 'train_tokens_per_second': '1977'} +{'loss': '1.557', 'grad_norm': '2.466', 'learning_rate': '4.999e-05', 'epoch': '0.1317', 'num_input_tokens_seen': 10709904, 'train_runtime': '5417', 'train_tokens_per_second': '1977'} +{'loss': '0.6876', 'grad_norm': '1.215', 'learning_rate': '4.999e-05', 'epoch': '0.1318', 'num_input_tokens_seen': 10711951, 'train_runtime': '5418', 'train_tokens_per_second': '1977'} +{'loss': '0.4303', 'grad_norm': '1.08', 'learning_rate': '4.999e-05', 'epoch': '0.1318', 'num_input_tokens_seen': 10713998, 'train_runtime': '5420', 'train_tokens_per_second': '1977'} +{'loss': '0.9492', 'grad_norm': '1.805', 'learning_rate': '4.999e-05', 'epoch': '0.1318', 'num_input_tokens_seen': 10716045, 'train_runtime': '5421', 'train_tokens_per_second': '1977'} +{'loss': '0.9554', 'grad_norm': '1.505', 'learning_rate': '4.999e-05', 'epoch': '0.1318', 'num_input_tokens_seen': 10718092, 'train_runtime': '5422', 'train_tokens_per_second': '1977'} +{'loss': '2.113', 'grad_norm': '2.736', 'learning_rate': '4.999e-05', 'epoch': '0.1319', 'num_input_tokens_seen': 10720139, 'train_runtime': '5423', 'train_tokens_per_second': '1977'} +{'loss': '1.297', 'grad_norm': '1.937', 'learning_rate': '4.999e-05', 'epoch': '0.1319', 'num_input_tokens_seen': 10722186, 'train_runtime': '5424', 'train_tokens_per_second': '1977'} +{'loss': '0.8844', 'grad_norm': '1.874', 'learning_rate': '4.999e-05', 'epoch': '0.1319', 'num_input_tokens_seen': 10724233, 'train_runtime': '5425', 'train_tokens_per_second': '1977'} +{'loss': '1.964', 'grad_norm': '2.676', 'learning_rate': '4.999e-05', 'epoch': '0.1319', 'num_input_tokens_seen': 10726280, 'train_runtime': '5426', 'train_tokens_per_second': '1977'} +{'loss': '1.354', 'grad_norm': '1.985', 'learning_rate': '4.999e-05', 'epoch': '0.132', 'num_input_tokens_seen': 10728327, 'train_runtime': '5427', 'train_tokens_per_second': '1977'} +{'loss': '0.9881', 'grad_norm': '1.473', 'learning_rate': '4.999e-05', 'epoch': '0.132', 'num_input_tokens_seen': 10730374, 'train_runtime': '5428', 'train_tokens_per_second': '1977'} +{'loss': '0.7286', 'grad_norm': '1.218', 'learning_rate': '4.999e-05', 'epoch': '0.132', 'num_input_tokens_seen': 10732421, 'train_runtime': '5429', 'train_tokens_per_second': '1977'} +{'loss': '0.668', 'grad_norm': '1.344', 'learning_rate': '4.999e-05', 'epoch': '0.132', 'num_input_tokens_seen': 10734468, 'train_runtime': '5430', 'train_tokens_per_second': '1977'} +{'loss': '0.4096', 'grad_norm': '1.643', 'learning_rate': '4.999e-05', 'epoch': '0.1321', 'num_input_tokens_seen': 10736515, 'train_runtime': '5431', 'train_tokens_per_second': '1977'} +{'loss': '0.9395', 'grad_norm': '1.715', 'learning_rate': '4.999e-05', 'epoch': '0.1321', 'num_input_tokens_seen': 10738562, 'train_runtime': '5432', 'train_tokens_per_second': '1977'} +{'loss': '0.9411', 'grad_norm': '1.525', 'learning_rate': '4.999e-05', 'epoch': '0.1321', 'num_input_tokens_seen': 10740609, 'train_runtime': '5433', 'train_tokens_per_second': '1977'} +{'loss': '2.188', 'grad_norm': '2.317', 'learning_rate': '4.999e-05', 'epoch': '0.1321', 'num_input_tokens_seen': 10742656, 'train_runtime': '5434', 'train_tokens_per_second': '1977'} +{'loss': '0.6966', 'grad_norm': '1.462', 'learning_rate': '4.999e-05', 'epoch': '0.1322', 'num_input_tokens_seen': 10744703, 'train_runtime': '5435', 'train_tokens_per_second': '1977'} +{'loss': '0.8359', 'grad_norm': '1.389', 'learning_rate': '4.999e-05', 'epoch': '0.1322', 'num_input_tokens_seen': 10746750, 'train_runtime': '5436', 'train_tokens_per_second': '1977'} +{'loss': '1.754', 'grad_norm': '2.578', 'learning_rate': '4.999e-05', 'epoch': '0.1322', 'num_input_tokens_seen': 10748797, 'train_runtime': '5437', 'train_tokens_per_second': '1977'} +{'loss': '0.901', 'grad_norm': '1.842', 'learning_rate': '4.999e-05', 'epoch': '0.1322', 'num_input_tokens_seen': 10750844, 'train_runtime': '5438', 'train_tokens_per_second': '1977'} +{'loss': '0.9313', 'grad_norm': '1.35', 'learning_rate': '4.999e-05', 'epoch': '0.1323', 'num_input_tokens_seen': 10752891, 'train_runtime': '5439', 'train_tokens_per_second': '1977'} +{'loss': '0.297', 'grad_norm': '1.09', 'learning_rate': '4.999e-05', 'epoch': '0.1323', 'num_input_tokens_seen': 10754938, 'train_runtime': '5440', 'train_tokens_per_second': '1977'} +{'loss': '0.4376', 'grad_norm': '1.349', 'learning_rate': '4.999e-05', 'epoch': '0.1323', 'num_input_tokens_seen': 10756985, 'train_runtime': '5441', 'train_tokens_per_second': '1977'} +{'loss': '0.3521', 'grad_norm': '0.9408', 'learning_rate': '4.999e-05', 'epoch': '0.1323', 'num_input_tokens_seen': 10759032, 'train_runtime': '5442', 'train_tokens_per_second': '1977'} +{'loss': '0.6275', 'grad_norm': '1.393', 'learning_rate': '4.999e-05', 'epoch': '0.1324', 'num_input_tokens_seen': 10761079, 'train_runtime': '5443', 'train_tokens_per_second': '1977'} +{'loss': '0.9174', 'grad_norm': '1.542', 'learning_rate': '4.999e-05', 'epoch': '0.1324', 'num_input_tokens_seen': 10763126, 'train_runtime': '5444', 'train_tokens_per_second': '1977'} +{'loss': '0.7783', 'grad_norm': '1.67', 'learning_rate': '4.999e-05', 'epoch': '0.1324', 'num_input_tokens_seen': 10765173, 'train_runtime': '5445', 'train_tokens_per_second': '1977'} +{'loss': '1.288', 'grad_norm': '2.062', 'learning_rate': '4.999e-05', 'epoch': '0.1324', 'num_input_tokens_seen': 10767220, 'train_runtime': '5446', 'train_tokens_per_second': '1977'} +{'loss': '0.4167', 'grad_norm': '0.8339', 'learning_rate': '4.999e-05', 'epoch': '0.1325', 'num_input_tokens_seen': 10769267, 'train_runtime': '5448', 'train_tokens_per_second': '1977'} +{'loss': '0.97', 'grad_norm': '1.649', 'learning_rate': '4.999e-05', 'epoch': '0.1325', 'num_input_tokens_seen': 10771314, 'train_runtime': '5449', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.638', 'learning_rate': '4.999e-05', 'epoch': '0.1325', 'num_input_tokens_seen': 10773361, 'train_runtime': '5450', 'train_tokens_per_second': '1977'} +{'loss': '0.9489', 'grad_norm': '1.593', 'learning_rate': '4.999e-05', 'epoch': '0.1325', 'num_input_tokens_seen': 10775408, 'train_runtime': '5451', 'train_tokens_per_second': '1977'} +{'loss': '0.9821', 'grad_norm': '1.764', 'learning_rate': '4.999e-05', 'epoch': '0.1326', 'num_input_tokens_seen': 10777455, 'train_runtime': '5452', 'train_tokens_per_second': '1977'} +{'loss': '1.209', 'grad_norm': '1.787', 'learning_rate': '4.999e-05', 'epoch': '0.1326', 'num_input_tokens_seen': 10779502, 'train_runtime': '5453', 'train_tokens_per_second': '1977'} +{'loss': '0.3105', 'grad_norm': '1.196', 'learning_rate': '4.999e-05', 'epoch': '0.1326', 'num_input_tokens_seen': 10781549, 'train_runtime': '5454', 'train_tokens_per_second': '1977'} +{'loss': '0.798', 'grad_norm': '1.612', 'learning_rate': '4.999e-05', 'epoch': '0.1326', 'num_input_tokens_seen': 10783596, 'train_runtime': '5455', 'train_tokens_per_second': '1977'} +{'loss': '0.5318', 'grad_norm': '1.322', 'learning_rate': '4.999e-05', 'epoch': '0.1327', 'num_input_tokens_seen': 10785643, 'train_runtime': '5456', 'train_tokens_per_second': '1977'} +{'loss': '0.9767', 'grad_norm': '1.89', 'learning_rate': '4.999e-05', 'epoch': '0.1327', 'num_input_tokens_seen': 10787690, 'train_runtime': '5457', 'train_tokens_per_second': '1977'} +{'loss': '0.3964', 'grad_norm': '1.256', 'learning_rate': '4.999e-05', 'epoch': '0.1327', 'num_input_tokens_seen': 10789737, 'train_runtime': '5458', 'train_tokens_per_second': '1977'} +{'loss': '0.3158', 'grad_norm': '1.205', 'learning_rate': '4.999e-05', 'epoch': '0.1327', 'num_input_tokens_seen': 10791784, 'train_runtime': '5459', 'train_tokens_per_second': '1977'} +{'loss': '1.958', 'grad_norm': '2.772', 'learning_rate': '4.999e-05', 'epoch': '0.1328', 'num_input_tokens_seen': 10793831, 'train_runtime': '5460', 'train_tokens_per_second': '1977'} +{'loss': '1.294', 'grad_norm': '1.815', 'learning_rate': '4.999e-05', 'epoch': '0.1328', 'num_input_tokens_seen': 10795878, 'train_runtime': '5461', 'train_tokens_per_second': '1977'} +{'loss': '0.7952', 'grad_norm': '1.351', 'learning_rate': '4.999e-05', 'epoch': '0.1328', 'num_input_tokens_seen': 10797925, 'train_runtime': '5462', 'train_tokens_per_second': '1977'} +{'loss': '1.179', 'grad_norm': '1.825', 'learning_rate': '4.999e-05', 'epoch': '0.1328', 'num_input_tokens_seen': 10799972, 'train_runtime': '5463', 'train_tokens_per_second': '1977'} +{'loss': '0.6975', 'grad_norm': '1.419', 'learning_rate': '4.999e-05', 'epoch': '0.1329', 'num_input_tokens_seen': 10802019, 'train_runtime': '5464', 'train_tokens_per_second': '1977'} +{'loss': '0.762', 'grad_norm': '1.456', 'learning_rate': '4.999e-05', 'epoch': '0.1329', 'num_input_tokens_seen': 10804066, 'train_runtime': '5465', 'train_tokens_per_second': '1977'} +{'loss': '1.813', 'grad_norm': '2.362', 'learning_rate': '4.999e-05', 'epoch': '0.1329', 'num_input_tokens_seen': 10806113, 'train_runtime': '5466', 'train_tokens_per_second': '1977'} +{'loss': '0.4136', 'grad_norm': '1.168', 'learning_rate': '4.999e-05', 'epoch': '0.1329', 'num_input_tokens_seen': 10808160, 'train_runtime': '5467', 'train_tokens_per_second': '1977'} +{'loss': '0.7803', 'grad_norm': '1.71', 'learning_rate': '4.999e-05', 'epoch': '0.133', 'num_input_tokens_seen': 10810207, 'train_runtime': '5468', 'train_tokens_per_second': '1977'} +{'loss': '2.116', 'grad_norm': '2.631', 'learning_rate': '4.999e-05', 'epoch': '0.133', 'num_input_tokens_seen': 10812254, 'train_runtime': '5469', 'train_tokens_per_second': '1977'} +{'loss': '2.884', 'grad_norm': '2.614', 'learning_rate': '4.999e-05', 'epoch': '0.133', 'num_input_tokens_seen': 10814301, 'train_runtime': '5470', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '2.035', 'learning_rate': '4.999e-05', 'epoch': '0.133', 'num_input_tokens_seen': 10816348, 'train_runtime': '5471', 'train_tokens_per_second': '1977'} +{'loss': '0.6', 'grad_norm': '1.135', 'learning_rate': '4.999e-05', 'epoch': '0.1331', 'num_input_tokens_seen': 10818395, 'train_runtime': '5472', 'train_tokens_per_second': '1977'} +{'loss': '0.7053', 'grad_norm': '1.767', 'learning_rate': '4.999e-05', 'epoch': '0.1331', 'num_input_tokens_seen': 10820442, 'train_runtime': '5473', 'train_tokens_per_second': '1977'} +{'loss': '0.7301', 'grad_norm': '1.302', 'learning_rate': '4.999e-05', 'epoch': '0.1331', 'num_input_tokens_seen': 10822489, 'train_runtime': '5474', 'train_tokens_per_second': '1977'} +{'loss': '0.9784', 'grad_norm': '1.496', 'learning_rate': '4.999e-05', 'epoch': '0.1331', 'num_input_tokens_seen': 10824536, 'train_runtime': '5475', 'train_tokens_per_second': '1977'} +{'loss': '0.7082', 'grad_norm': '1.418', 'learning_rate': '4.999e-05', 'epoch': '0.1332', 'num_input_tokens_seen': 10826583, 'train_runtime': '5476', 'train_tokens_per_second': '1977'} +{'loss': '0.4889', 'grad_norm': '1.17', 'learning_rate': '4.999e-05', 'epoch': '0.1332', 'num_input_tokens_seen': 10828630, 'train_runtime': '5478', 'train_tokens_per_second': '1977'} +{'loss': '1.186', 'grad_norm': '2.307', 'learning_rate': '4.999e-05', 'epoch': '0.1332', 'num_input_tokens_seen': 10830677, 'train_runtime': '5479', 'train_tokens_per_second': '1977'} +{'loss': '0.7065', 'grad_norm': '1.468', 'learning_rate': '4.999e-05', 'epoch': '0.1332', 'num_input_tokens_seen': 10832724, 'train_runtime': '5480', 'train_tokens_per_second': '1977'} +{'loss': '0.417', 'grad_norm': '1.352', 'learning_rate': '4.999e-05', 'epoch': '0.1333', 'num_input_tokens_seen': 10834771, 'train_runtime': '5481', 'train_tokens_per_second': '1977'} +{'loss': '0.3729', 'grad_norm': '1.105', 'learning_rate': '4.999e-05', 'epoch': '0.1333', 'num_input_tokens_seen': 10836818, 'train_runtime': '5482', 'train_tokens_per_second': '1977'} +{'loss': '1.422', 'grad_norm': '1.805', 'learning_rate': '4.999e-05', 'epoch': '0.1333', 'num_input_tokens_seen': 10838865, 'train_runtime': '5483', 'train_tokens_per_second': '1977'} +{'loss': '0.4757', 'grad_norm': '0.9105', 'learning_rate': '4.999e-05', 'epoch': '0.1333', 'num_input_tokens_seen': 10840912, 'train_runtime': '5484', 'train_tokens_per_second': '1977'} +{'loss': '0.3093', 'grad_norm': '0.9062', 'learning_rate': '4.999e-05', 'epoch': '0.1334', 'num_input_tokens_seen': 10842959, 'train_runtime': '5485', 'train_tokens_per_second': '1977'} +{'loss': '1.665', 'grad_norm': '2.264', 'learning_rate': '4.999e-05', 'epoch': '0.1334', 'num_input_tokens_seen': 10845006, 'train_runtime': '5486', 'train_tokens_per_second': '1977'} +{'loss': '0.3904', 'grad_norm': '1.034', 'learning_rate': '4.999e-05', 'epoch': '0.1334', 'num_input_tokens_seen': 10847053, 'train_runtime': '5487', 'train_tokens_per_second': '1977'} +{'loss': '0.3327', 'grad_norm': '1.063', 'learning_rate': '4.999e-05', 'epoch': '0.1334', 'num_input_tokens_seen': 10849100, 'train_runtime': '5488', 'train_tokens_per_second': '1977'} +{'loss': '0.988', 'grad_norm': '1.908', 'learning_rate': '4.999e-05', 'epoch': '0.1335', 'num_input_tokens_seen': 10851147, 'train_runtime': '5489', 'train_tokens_per_second': '1977'} +{'loss': '0.7759', 'grad_norm': '1.493', 'learning_rate': '4.999e-05', 'epoch': '0.1335', 'num_input_tokens_seen': 10853194, 'train_runtime': '5490', 'train_tokens_per_second': '1977'} +{'loss': '0.5316', 'grad_norm': '1.449', 'learning_rate': '4.999e-05', 'epoch': '0.1335', 'num_input_tokens_seen': 10855241, 'train_runtime': '5491', 'train_tokens_per_second': '1977'} +{'loss': '2.357', 'grad_norm': '2.078', 'learning_rate': '4.999e-05', 'epoch': '0.1335', 'num_input_tokens_seen': 10857288, 'train_runtime': '5492', 'train_tokens_per_second': '1977'} +{'loss': '0.5015', 'grad_norm': '1.2', 'learning_rate': '4.999e-05', 'epoch': '0.1336', 'num_input_tokens_seen': 10859335, 'train_runtime': '5493', 'train_tokens_per_second': '1977'} +{'loss': '0.3482', 'grad_norm': '1.235', 'learning_rate': '4.999e-05', 'epoch': '0.1336', 'num_input_tokens_seen': 10861382, 'train_runtime': '5494', 'train_tokens_per_second': '1977'} +{'loss': '0.8819', 'grad_norm': '1.368', 'learning_rate': '4.999e-05', 'epoch': '0.1336', 'num_input_tokens_seen': 10863429, 'train_runtime': '5495', 'train_tokens_per_second': '1977'} +{'loss': '1.193', 'grad_norm': '1.697', 'learning_rate': '4.999e-05', 'epoch': '0.1336', 'num_input_tokens_seen': 10865476, 'train_runtime': '5496', 'train_tokens_per_second': '1977'} +{'loss': '1.711', 'grad_norm': '2.634', 'learning_rate': '4.999e-05', 'epoch': '0.1337', 'num_input_tokens_seen': 10867523, 'train_runtime': '5497', 'train_tokens_per_second': '1977'} +{'loss': '0.9126', 'grad_norm': '1.417', 'learning_rate': '4.999e-05', 'epoch': '0.1337', 'num_input_tokens_seen': 10869570, 'train_runtime': '5498', 'train_tokens_per_second': '1977'} +{'loss': '1.321', 'grad_norm': '1.988', 'learning_rate': '4.999e-05', 'epoch': '0.1337', 'num_input_tokens_seen': 10871617, 'train_runtime': '5499', 'train_tokens_per_second': '1977'} +{'loss': '0.7101', 'grad_norm': '1.615', 'learning_rate': '4.999e-05', 'epoch': '0.1337', 'num_input_tokens_seen': 10873664, 'train_runtime': '5500', 'train_tokens_per_second': '1977'} +{'loss': '1.002', 'grad_norm': '1.464', 'learning_rate': '4.999e-05', 'epoch': '0.1338', 'num_input_tokens_seen': 10875711, 'train_runtime': '5501', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '1.529', 'learning_rate': '4.999e-05', 'epoch': '0.1338', 'num_input_tokens_seen': 10877758, 'train_runtime': '5502', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.781', 'learning_rate': '4.999e-05', 'epoch': '0.1338', 'num_input_tokens_seen': 10879805, 'train_runtime': '5503', 'train_tokens_per_second': '1977'} +{'loss': '0.7952', 'grad_norm': '1.537', 'learning_rate': '4.999e-05', 'epoch': '0.1338', 'num_input_tokens_seen': 10881852, 'train_runtime': '5504', 'train_tokens_per_second': '1977'} +{'loss': '0.4271', 'grad_norm': '1.028', 'learning_rate': '4.999e-05', 'epoch': '0.1339', 'num_input_tokens_seen': 10883899, 'train_runtime': '5505', 'train_tokens_per_second': '1977'} +{'loss': '0.2941', 'grad_norm': '0.9876', 'learning_rate': '4.999e-05', 'epoch': '0.1339', 'num_input_tokens_seen': 10885946, 'train_runtime': '5506', 'train_tokens_per_second': '1977'} +{'loss': '1.955', 'grad_norm': '2.232', 'learning_rate': '4.999e-05', 'epoch': '0.1339', 'num_input_tokens_seen': 10887993, 'train_runtime': '5507', 'train_tokens_per_second': '1977'} +{'loss': '0.8126', 'grad_norm': '1.779', 'learning_rate': '4.999e-05', 'epoch': '0.1339', 'num_input_tokens_seen': 10890040, 'train_runtime': '5509', 'train_tokens_per_second': '1977'} +{'loss': '1.19', 'grad_norm': '1.916', 'learning_rate': '4.999e-05', 'epoch': '0.134', 'num_input_tokens_seen': 10892087, 'train_runtime': '5510', 'train_tokens_per_second': '1977'} +{'loss': '1.185', 'grad_norm': '2.002', 'learning_rate': '4.999e-05', 'epoch': '0.134', 'num_input_tokens_seen': 10894134, 'train_runtime': '5511', 'train_tokens_per_second': '1977'} +{'loss': '0.4244', 'grad_norm': '1.065', 'learning_rate': '4.999e-05', 'epoch': '0.134', 'num_input_tokens_seen': 10896181, 'train_runtime': '5512', 'train_tokens_per_second': '1977'} +{'loss': '1.351', 'grad_norm': '2.005', 'learning_rate': '4.999e-05', 'epoch': '0.134', 'num_input_tokens_seen': 10898228, 'train_runtime': '5513', 'train_tokens_per_second': '1977'} +{'loss': '0.6287', 'grad_norm': '1.314', 'learning_rate': '4.999e-05', 'epoch': '0.1341', 'num_input_tokens_seen': 10900275, 'train_runtime': '5514', 'train_tokens_per_second': '1977'} +{'loss': '1.208', 'grad_norm': '2.02', 'learning_rate': '4.999e-05', 'epoch': '0.1341', 'num_input_tokens_seen': 10902322, 'train_runtime': '5515', 'train_tokens_per_second': '1977'} +{'loss': '2.003', 'grad_norm': '2.504', 'learning_rate': '4.999e-05', 'epoch': '0.1341', 'num_input_tokens_seen': 10904369, 'train_runtime': '5516', 'train_tokens_per_second': '1977'} +{'loss': '0.6243', 'grad_norm': '1.799', 'learning_rate': '4.999e-05', 'epoch': '0.1341', 'num_input_tokens_seen': 10906416, 'train_runtime': '5517', 'train_tokens_per_second': '1977'} +{'loss': '0.3198', 'grad_norm': '1.076', 'learning_rate': '4.999e-05', 'epoch': '0.1342', 'num_input_tokens_seen': 10908463, 'train_runtime': '5518', 'train_tokens_per_second': '1977'} +{'loss': '0.8018', 'grad_norm': '1.31', 'learning_rate': '4.999e-05', 'epoch': '0.1342', 'num_input_tokens_seen': 10910510, 'train_runtime': '5519', 'train_tokens_per_second': '1977'} +{'loss': '0.4688', 'grad_norm': '1.162', 'learning_rate': '4.999e-05', 'epoch': '0.1342', 'num_input_tokens_seen': 10912557, 'train_runtime': '5520', 'train_tokens_per_second': '1977'} +{'loss': '0.9576', 'grad_norm': '1.845', 'learning_rate': '4.999e-05', 'epoch': '0.1342', 'num_input_tokens_seen': 10914604, 'train_runtime': '5521', 'train_tokens_per_second': '1977'} +{'loss': '0.5728', 'grad_norm': '1.352', 'learning_rate': '4.999e-05', 'epoch': '0.1343', 'num_input_tokens_seen': 10916651, 'train_runtime': '5522', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '1.773', 'learning_rate': '4.999e-05', 'epoch': '0.1343', 'num_input_tokens_seen': 10918698, 'train_runtime': '5523', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '2.359', 'learning_rate': '4.999e-05', 'epoch': '0.1343', 'num_input_tokens_seen': 10920745, 'train_runtime': '5524', 'train_tokens_per_second': '1977'} +{'loss': '0.7201', 'grad_norm': '1.285', 'learning_rate': '4.999e-05', 'epoch': '0.1344', 'num_input_tokens_seen': 10922792, 'train_runtime': '5525', 'train_tokens_per_second': '1977'} +{'loss': '0.3854', 'grad_norm': '1.143', 'learning_rate': '4.999e-05', 'epoch': '0.1344', 'num_input_tokens_seen': 10924839, 'train_runtime': '5526', 'train_tokens_per_second': '1977'} +{'loss': '0.2765', 'grad_norm': '0.9764', 'learning_rate': '4.999e-05', 'epoch': '0.1344', 'num_input_tokens_seen': 10926886, 'train_runtime': '5527', 'train_tokens_per_second': '1977'} +{'loss': '0.4802', 'grad_norm': '1.125', 'learning_rate': '4.999e-05', 'epoch': '0.1344', 'num_input_tokens_seen': 10928933, 'train_runtime': '5528', 'train_tokens_per_second': '1977'} +{'loss': '0.5918', 'grad_norm': '1.456', 'learning_rate': '4.999e-05', 'epoch': '0.1345', 'num_input_tokens_seen': 10930980, 'train_runtime': '5529', 'train_tokens_per_second': '1977'} +{'loss': '0.6429', 'grad_norm': '1.237', 'learning_rate': '4.999e-05', 'epoch': '0.1345', 'num_input_tokens_seen': 10933027, 'train_runtime': '5530', 'train_tokens_per_second': '1977'} +{'loss': '0.3779', 'grad_norm': '1.119', 'learning_rate': '4.999e-05', 'epoch': '0.1345', 'num_input_tokens_seen': 10935074, 'train_runtime': '5531', 'train_tokens_per_second': '1977'} +{'loss': '0.7852', 'grad_norm': '1.487', 'learning_rate': '4.999e-05', 'epoch': '0.1345', 'num_input_tokens_seen': 10937121, 'train_runtime': '5532', 'train_tokens_per_second': '1977'} +{'loss': '0.2935', 'grad_norm': '1.071', 'learning_rate': '4.999e-05', 'epoch': '0.1346', 'num_input_tokens_seen': 10939168, 'train_runtime': '5533', 'train_tokens_per_second': '1977'} +{'loss': '0.5589', 'grad_norm': '1.353', 'learning_rate': '4.999e-05', 'epoch': '0.1346', 'num_input_tokens_seen': 10941215, 'train_runtime': '5534', 'train_tokens_per_second': '1977'} +{'loss': '0.7956', 'grad_norm': '1.351', 'learning_rate': '4.999e-05', 'epoch': '0.1346', 'num_input_tokens_seen': 10943262, 'train_runtime': '5535', 'train_tokens_per_second': '1977'} +{'loss': '0.3512', 'grad_norm': '1.04', 'learning_rate': '4.999e-05', 'epoch': '0.1346', 'num_input_tokens_seen': 10945309, 'train_runtime': '5536', 'train_tokens_per_second': '1977'} +{'loss': '2.04', 'grad_norm': '2.33', 'learning_rate': '4.999e-05', 'epoch': '0.1347', 'num_input_tokens_seen': 10947356, 'train_runtime': '5538', 'train_tokens_per_second': '1977'} +{'loss': '0.3043', 'grad_norm': '1.151', 'learning_rate': '4.999e-05', 'epoch': '0.1347', 'num_input_tokens_seen': 10949403, 'train_runtime': '5539', 'train_tokens_per_second': '1977'} +{'loss': '1.239', 'grad_norm': '1.938', 'learning_rate': '4.999e-05', 'epoch': '0.1347', 'num_input_tokens_seen': 10951450, 'train_runtime': '5540', 'train_tokens_per_second': '1977'} +{'loss': '0.2419', 'grad_norm': '1.045', 'learning_rate': '4.999e-05', 'epoch': '0.1347', 'num_input_tokens_seen': 10953497, 'train_runtime': '5541', 'train_tokens_per_second': '1977'} +{'loss': '1.002', 'grad_norm': '1.763', 'learning_rate': '4.999e-05', 'epoch': '0.1348', 'num_input_tokens_seen': 10955544, 'train_runtime': '5542', 'train_tokens_per_second': '1977'} +{'loss': '0.8201', 'grad_norm': '1.272', 'learning_rate': '4.999e-05', 'epoch': '0.1348', 'num_input_tokens_seen': 10957591, 'train_runtime': '5543', 'train_tokens_per_second': '1977'} +{'loss': '2.131', 'grad_norm': '2.586', 'learning_rate': '4.999e-05', 'epoch': '0.1348', 'num_input_tokens_seen': 10959638, 'train_runtime': '5544', 'train_tokens_per_second': '1977'} +{'loss': '0.421', 'grad_norm': '1.227', 'learning_rate': '4.999e-05', 'epoch': '0.1348', 'num_input_tokens_seen': 10961685, 'train_runtime': '5545', 'train_tokens_per_second': '1977'} +{'loss': '2.188', 'grad_norm': '2.39', 'learning_rate': '4.999e-05', 'epoch': '0.1349', 'num_input_tokens_seen': 10963732, 'train_runtime': '5546', 'train_tokens_per_second': '1977'} +{'loss': '0.5645', 'grad_norm': '1.645', 'learning_rate': '4.999e-05', 'epoch': '0.1349', 'num_input_tokens_seen': 10965779, 'train_runtime': '5547', 'train_tokens_per_second': '1977'} +{'loss': '0.4225', 'grad_norm': '1.151', 'learning_rate': '4.999e-05', 'epoch': '0.1349', 'num_input_tokens_seen': 10967826, 'train_runtime': '5548', 'train_tokens_per_second': '1977'} +{'loss': '0.5911', 'grad_norm': '1.597', 'learning_rate': '4.999e-05', 'epoch': '0.1349', 'num_input_tokens_seen': 10969873, 'train_runtime': '5549', 'train_tokens_per_second': '1977'} +{'loss': '0.7708', 'grad_norm': '1.46', 'learning_rate': '4.999e-05', 'epoch': '0.135', 'num_input_tokens_seen': 10971920, 'train_runtime': '5550', 'train_tokens_per_second': '1977'} +{'loss': '1.726', 'grad_norm': '2.219', 'learning_rate': '4.999e-05', 'epoch': '0.135', 'num_input_tokens_seen': 10973967, 'train_runtime': '5551', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '1.819', 'learning_rate': '4.999e-05', 'epoch': '0.135', 'num_input_tokens_seen': 10976014, 'train_runtime': '5552', 'train_tokens_per_second': '1977'} +{'loss': '0.5094', 'grad_norm': '1.425', 'learning_rate': '4.999e-05', 'epoch': '0.135', 'num_input_tokens_seen': 10978061, 'train_runtime': '5553', 'train_tokens_per_second': '1977'} +{'loss': '2.438', 'grad_norm': '2.282', 'learning_rate': '4.999e-05', 'epoch': '0.1351', 'num_input_tokens_seen': 10980108, 'train_runtime': '5554', 'train_tokens_per_second': '1977'} +{'loss': '0.4325', 'grad_norm': '1.15', 'learning_rate': '4.999e-05', 'epoch': '0.1351', 'num_input_tokens_seen': 10982155, 'train_runtime': '5555', 'train_tokens_per_second': '1977'} +{'loss': '1.405', 'grad_norm': '2.125', 'learning_rate': '4.999e-05', 'epoch': '0.1351', 'num_input_tokens_seen': 10984202, 'train_runtime': '5556', 'train_tokens_per_second': '1977'} +{'loss': '1.132', 'grad_norm': '1.515', 'learning_rate': '4.999e-05', 'epoch': '0.1351', 'num_input_tokens_seen': 10986249, 'train_runtime': '5557', 'train_tokens_per_second': '1977'} +{'loss': '1.62', 'grad_norm': '1.952', 'learning_rate': '4.999e-05', 'epoch': '0.1352', 'num_input_tokens_seen': 10988296, 'train_runtime': '5558', 'train_tokens_per_second': '1977'} +{'loss': '0.3108', 'grad_norm': '0.9718', 'learning_rate': '4.999e-05', 'epoch': '0.1352', 'num_input_tokens_seen': 10990343, 'train_runtime': '5559', 'train_tokens_per_second': '1977'} +{'loss': '0.2809', 'grad_norm': '1.116', 'learning_rate': '4.999e-05', 'epoch': '0.1352', 'num_input_tokens_seen': 10992390, 'train_runtime': '5560', 'train_tokens_per_second': '1977'} +{'loss': '0.9511', 'grad_norm': '1.655', 'learning_rate': '4.999e-05', 'epoch': '0.1352', 'num_input_tokens_seen': 10994437, 'train_runtime': '5561', 'train_tokens_per_second': '1977'} +{'loss': '0.6808', 'grad_norm': '1.435', 'learning_rate': '4.999e-05', 'epoch': '0.1353', 'num_input_tokens_seen': 10996484, 'train_runtime': '5562', 'train_tokens_per_second': '1977'} +{'loss': '0.4509', 'grad_norm': '1.03', 'learning_rate': '4.999e-05', 'epoch': '0.1353', 'num_input_tokens_seen': 10998531, 'train_runtime': '5563', 'train_tokens_per_second': '1977'} +{'loss': '0.7694', 'grad_norm': '1.895', 'learning_rate': '4.999e-05', 'epoch': '0.1353', 'num_input_tokens_seen': 11000578, 'train_runtime': '5564', 'train_tokens_per_second': '1977'} +{'loss': '1.859', 'grad_norm': '2.094', 'learning_rate': '4.999e-05', 'epoch': '0.1353', 'num_input_tokens_seen': 11002625, 'train_runtime': '5566', 'train_tokens_per_second': '1977'} +{'loss': '0.6066', 'grad_norm': '1.403', 'learning_rate': '4.999e-05', 'epoch': '0.1354', 'num_input_tokens_seen': 11004672, 'train_runtime': '5567', 'train_tokens_per_second': '1977'} +{'loss': '1.018', 'grad_norm': '1.851', 'learning_rate': '4.999e-05', 'epoch': '0.1354', 'num_input_tokens_seen': 11006719, 'train_runtime': '5568', 'train_tokens_per_second': '1977'} +{'loss': '0.686', 'grad_norm': '1.622', 'learning_rate': '4.999e-05', 'epoch': '0.1354', 'num_input_tokens_seen': 11008766, 'train_runtime': '5569', 'train_tokens_per_second': '1977'} +{'loss': '0.7651', 'grad_norm': '1.275', 'learning_rate': '4.999e-05', 'epoch': '0.1354', 'num_input_tokens_seen': 11010813, 'train_runtime': '5570', 'train_tokens_per_second': '1977'} +{'loss': '1.527', 'grad_norm': '2.033', 'learning_rate': '4.999e-05', 'epoch': '0.1355', 'num_input_tokens_seen': 11012860, 'train_runtime': '5571', 'train_tokens_per_second': '1977'} +{'loss': '0.8117', 'grad_norm': '2.165', 'learning_rate': '4.999e-05', 'epoch': '0.1355', 'num_input_tokens_seen': 11014907, 'train_runtime': '5572', 'train_tokens_per_second': '1977'} +{'loss': '0.4653', 'grad_norm': '1.209', 'learning_rate': '4.999e-05', 'epoch': '0.1355', 'num_input_tokens_seen': 11016954, 'train_runtime': '5573', 'train_tokens_per_second': '1977'} +{'loss': '0.9006', 'grad_norm': '1.842', 'learning_rate': '4.999e-05', 'epoch': '0.1355', 'num_input_tokens_seen': 11019001, 'train_runtime': '5574', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '2.273', 'learning_rate': '4.999e-05', 'epoch': '0.1356', 'num_input_tokens_seen': 11021048, 'train_runtime': '5575', 'train_tokens_per_second': '1977'} +{'loss': '0.4123', 'grad_norm': '1.147', 'learning_rate': '4.999e-05', 'epoch': '0.1356', 'num_input_tokens_seen': 11023095, 'train_runtime': '5576', 'train_tokens_per_second': '1977'} +{'loss': '0.9423', 'grad_norm': '1.711', 'learning_rate': '4.999e-05', 'epoch': '0.1356', 'num_input_tokens_seen': 11025142, 'train_runtime': '5577', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '1.934', 'learning_rate': '4.999e-05', 'epoch': '0.1356', 'num_input_tokens_seen': 11027189, 'train_runtime': '5578', 'train_tokens_per_second': '1977'} +{'loss': '0.4275', 'grad_norm': '1.147', 'learning_rate': '4.999e-05', 'epoch': '0.1357', 'num_input_tokens_seen': 11029236, 'train_runtime': '5579', 'train_tokens_per_second': '1977'} +{'loss': '1.362', 'grad_norm': '2.059', 'learning_rate': '4.999e-05', 'epoch': '0.1357', 'num_input_tokens_seen': 11031283, 'train_runtime': '5580', 'train_tokens_per_second': '1977'} +{'loss': '0.5109', 'grad_norm': '1.357', 'learning_rate': '4.999e-05', 'epoch': '0.1357', 'num_input_tokens_seen': 11033330, 'train_runtime': '5581', 'train_tokens_per_second': '1977'} +{'loss': '1.668', 'grad_norm': '2.233', 'learning_rate': '4.999e-05', 'epoch': '0.1357', 'num_input_tokens_seen': 11035377, 'train_runtime': '5582', 'train_tokens_per_second': '1977'} +{'loss': '0.6912', 'grad_norm': '1.575', 'learning_rate': '4.999e-05', 'epoch': '0.1358', 'num_input_tokens_seen': 11037424, 'train_runtime': '5583', 'train_tokens_per_second': '1977'} +{'loss': '1.893', 'grad_norm': '2.78', 'learning_rate': '4.999e-05', 'epoch': '0.1358', 'num_input_tokens_seen': 11039471, 'train_runtime': '5584', 'train_tokens_per_second': '1977'} +{'loss': '0.4379', 'grad_norm': '0.9033', 'learning_rate': '4.999e-05', 'epoch': '0.1358', 'num_input_tokens_seen': 11041518, 'train_runtime': '5585', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '2.001', 'learning_rate': '4.999e-05', 'epoch': '0.1358', 'num_input_tokens_seen': 11043565, 'train_runtime': '5586', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.645', 'learning_rate': '4.999e-05', 'epoch': '0.1359', 'num_input_tokens_seen': 11045612, 'train_runtime': '5587', 'train_tokens_per_second': '1977'} +{'loss': '1.821', 'grad_norm': '2.596', 'learning_rate': '4.999e-05', 'epoch': '0.1359', 'num_input_tokens_seen': 11047659, 'train_runtime': '5588', 'train_tokens_per_second': '1977'} +{'loss': '1.684', 'grad_norm': '2.028', 'learning_rate': '4.999e-05', 'epoch': '0.1359', 'num_input_tokens_seen': 11049706, 'train_runtime': '5589', 'train_tokens_per_second': '1977'} +{'loss': '0.3421', 'grad_norm': '1.007', 'learning_rate': '4.999e-05', 'epoch': '0.1359', 'num_input_tokens_seen': 11051753, 'train_runtime': '5590', 'train_tokens_per_second': '1977'} +{'loss': '0.5834', 'grad_norm': '1.252', 'learning_rate': '4.999e-05', 'epoch': '0.136', 'num_input_tokens_seen': 11053800, 'train_runtime': '5591', 'train_tokens_per_second': '1977'} +{'loss': '0.3781', 'grad_norm': '1.32', 'learning_rate': '4.999e-05', 'epoch': '0.136', 'num_input_tokens_seen': 11055847, 'train_runtime': '5592', 'train_tokens_per_second': '1977'} +{'loss': '1.071', 'grad_norm': '1.783', 'learning_rate': '4.999e-05', 'epoch': '0.136', 'num_input_tokens_seen': 11057894, 'train_runtime': '5593', 'train_tokens_per_second': '1977'} +{'loss': '0.7913', 'grad_norm': '1.718', 'learning_rate': '4.999e-05', 'epoch': '0.136', 'num_input_tokens_seen': 11059941, 'train_runtime': '5595', 'train_tokens_per_second': '1977'} +{'loss': '0.6676', 'grad_norm': '1.516', 'learning_rate': '4.999e-05', 'epoch': '0.1361', 'num_input_tokens_seen': 11061988, 'train_runtime': '5596', 'train_tokens_per_second': '1977'} +{'loss': '0.3272', 'grad_norm': '1.201', 'learning_rate': '4.999e-05', 'epoch': '0.1361', 'num_input_tokens_seen': 11064035, 'train_runtime': '5597', 'train_tokens_per_second': '1977'} +{'loss': '0.8155', 'grad_norm': '1.194', 'learning_rate': '4.999e-05', 'epoch': '0.1361', 'num_input_tokens_seen': 11066082, 'train_runtime': '5598', 'train_tokens_per_second': '1977'} +{'loss': '0.6961', 'grad_norm': '1.017', 'learning_rate': '4.999e-05', 'epoch': '0.1361', 'num_input_tokens_seen': 11068129, 'train_runtime': '5599', 'train_tokens_per_second': '1977'} +{'loss': '0.3727', 'grad_norm': '1.103', 'learning_rate': '4.999e-05', 'epoch': '0.1362', 'num_input_tokens_seen': 11070176, 'train_runtime': '5600', 'train_tokens_per_second': '1977'} +{'loss': '0.9132', 'grad_norm': '1.577', 'learning_rate': '4.999e-05', 'epoch': '0.1362', 'num_input_tokens_seen': 11072223, 'train_runtime': '5601', 'train_tokens_per_second': '1977'} +{'loss': '0.4674', 'grad_norm': '0.8854', 'learning_rate': '4.999e-05', 'epoch': '0.1362', 'num_input_tokens_seen': 11074270, 'train_runtime': '5602', 'train_tokens_per_second': '1977'} +{'loss': '1.735', 'grad_norm': '2.46', 'learning_rate': '4.999e-05', 'epoch': '0.1362', 'num_input_tokens_seen': 11076317, 'train_runtime': '5603', 'train_tokens_per_second': '1977'} +{'loss': '0.3861', 'grad_norm': '1.12', 'learning_rate': '4.999e-05', 'epoch': '0.1363', 'num_input_tokens_seen': 11078364, 'train_runtime': '5604', 'train_tokens_per_second': '1977'} +{'loss': '0.7738', 'grad_norm': '1.649', 'learning_rate': '4.999e-05', 'epoch': '0.1363', 'num_input_tokens_seen': 11080411, 'train_runtime': '5605', 'train_tokens_per_second': '1977'} +{'loss': '0.4693', 'grad_norm': '0.9659', 'learning_rate': '4.999e-05', 'epoch': '0.1363', 'num_input_tokens_seen': 11082458, 'train_runtime': '5606', 'train_tokens_per_second': '1977'} +{'loss': '0.3848', 'grad_norm': '0.8357', 'learning_rate': '4.999e-05', 'epoch': '0.1363', 'num_input_tokens_seen': 11084505, 'train_runtime': '5607', 'train_tokens_per_second': '1977'} +{'loss': '0.7805', 'grad_norm': '1.551', 'learning_rate': '4.999e-05', 'epoch': '0.1364', 'num_input_tokens_seen': 11086552, 'train_runtime': '5608', 'train_tokens_per_second': '1977'} +{'loss': '0.4927', 'grad_norm': '1.054', 'learning_rate': '4.999e-05', 'epoch': '0.1364', 'num_input_tokens_seen': 11088599, 'train_runtime': '5609', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.69', 'learning_rate': '4.999e-05', 'epoch': '0.1364', 'num_input_tokens_seen': 11090646, 'train_runtime': '5610', 'train_tokens_per_second': '1977'} +{'loss': '0.7153', 'grad_norm': '1.29', 'learning_rate': '4.999e-05', 'epoch': '0.1364', 'num_input_tokens_seen': 11092693, 'train_runtime': '5611', 'train_tokens_per_second': '1977'} +{'loss': '0.9563', 'grad_norm': '1.647', 'learning_rate': '4.999e-05', 'epoch': '0.1365', 'num_input_tokens_seen': 11094740, 'train_runtime': '5612', 'train_tokens_per_second': '1977'} +{'loss': '1.499', 'grad_norm': '2.022', 'learning_rate': '4.999e-05', 'epoch': '0.1365', 'num_input_tokens_seen': 11096787, 'train_runtime': '5613', 'train_tokens_per_second': '1977'} +{'loss': '0.2702', 'grad_norm': '0.9881', 'learning_rate': '4.999e-05', 'epoch': '0.1365', 'num_input_tokens_seen': 11098834, 'train_runtime': '5614', 'train_tokens_per_second': '1977'} +{'loss': '0.2481', 'grad_norm': '0.9866', 'learning_rate': '4.999e-05', 'epoch': '0.1365', 'num_input_tokens_seen': 11100881, 'train_runtime': '5615', 'train_tokens_per_second': '1977'} +{'loss': '0.3907', 'grad_norm': '1.236', 'learning_rate': '4.999e-05', 'epoch': '0.1366', 'num_input_tokens_seen': 11102928, 'train_runtime': '5616', 'train_tokens_per_second': '1977'} +{'loss': '0.694', 'grad_norm': '1.075', 'learning_rate': '4.999e-05', 'epoch': '0.1366', 'num_input_tokens_seen': 11104975, 'train_runtime': '5617', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.349', 'learning_rate': '4.999e-05', 'epoch': '0.1366', 'num_input_tokens_seen': 11107022, 'train_runtime': '5618', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '2.149', 'learning_rate': '4.999e-05', 'epoch': '0.1366', 'num_input_tokens_seen': 11109069, 'train_runtime': '5619', 'train_tokens_per_second': '1977'} +{'loss': '0.6218', 'grad_norm': '1.57', 'learning_rate': '4.999e-05', 'epoch': '0.1367', 'num_input_tokens_seen': 11111116, 'train_runtime': '5620', 'train_tokens_per_second': '1977'} +{'loss': '0.8449', 'grad_norm': '1.651', 'learning_rate': '4.999e-05', 'epoch': '0.1367', 'num_input_tokens_seen': 11113163, 'train_runtime': '5621', 'train_tokens_per_second': '1977'} +{'loss': '0.373', 'grad_norm': '1.146', 'learning_rate': '4.999e-05', 'epoch': '0.1367', 'num_input_tokens_seen': 11115210, 'train_runtime': '5623', 'train_tokens_per_second': '1977'} +{'loss': '1.46', 'grad_norm': '2.13', 'learning_rate': '4.999e-05', 'epoch': '0.1367', 'num_input_tokens_seen': 11117257, 'train_runtime': '5624', 'train_tokens_per_second': '1977'} +{'loss': '0.3163', 'grad_norm': '1.399', 'learning_rate': '4.999e-05', 'epoch': '0.1368', 'num_input_tokens_seen': 11119304, 'train_runtime': '5625', 'train_tokens_per_second': '1977'} +{'loss': '0.394', 'grad_norm': '1.133', 'learning_rate': '4.999e-05', 'epoch': '0.1368', 'num_input_tokens_seen': 11121351, 'train_runtime': '5626', 'train_tokens_per_second': '1977'} +{'loss': '2.214', 'grad_norm': '2.879', 'learning_rate': '4.999e-05', 'epoch': '0.1368', 'num_input_tokens_seen': 11123398, 'train_runtime': '5627', 'train_tokens_per_second': '1977'} +{'loss': '0.9455', 'grad_norm': '1.552', 'learning_rate': '4.999e-05', 'epoch': '0.1368', 'num_input_tokens_seen': 11125445, 'train_runtime': '5628', 'train_tokens_per_second': '1977'} +{'loss': '0.4619', 'grad_norm': '1.153', 'learning_rate': '4.999e-05', 'epoch': '0.1369', 'num_input_tokens_seen': 11127492, 'train_runtime': '5629', 'train_tokens_per_second': '1977'} +{'loss': '0.4226', 'grad_norm': '1.117', 'learning_rate': '4.999e-05', 'epoch': '0.1369', 'num_input_tokens_seen': 11129539, 'train_runtime': '5630', 'train_tokens_per_second': '1977'} +{'loss': '0.3525', 'grad_norm': '1.087', 'learning_rate': '4.999e-05', 'epoch': '0.1369', 'num_input_tokens_seen': 11131586, 'train_runtime': '5631', 'train_tokens_per_second': '1977'} +{'loss': '0.9726', 'grad_norm': '1.706', 'learning_rate': '4.999e-05', 'epoch': '0.1369', 'num_input_tokens_seen': 11133633, 'train_runtime': '5632', 'train_tokens_per_second': '1977'} +{'loss': '0.6669', 'grad_norm': '1.331', 'learning_rate': '4.999e-05', 'epoch': '0.137', 'num_input_tokens_seen': 11135680, 'train_runtime': '5633', 'train_tokens_per_second': '1977'} +{'loss': '0.5835', 'grad_norm': '1.261', 'learning_rate': '4.999e-05', 'epoch': '0.137', 'num_input_tokens_seen': 11137727, 'train_runtime': '5634', 'train_tokens_per_second': '1977'} +{'loss': '1.055', 'grad_norm': '1.631', 'learning_rate': '4.999e-05', 'epoch': '0.137', 'num_input_tokens_seen': 11139774, 'train_runtime': '5635', 'train_tokens_per_second': '1977'} +{'loss': '0.9259', 'grad_norm': '1.632', 'learning_rate': '4.999e-05', 'epoch': '0.137', 'num_input_tokens_seen': 11141821, 'train_runtime': '5636', 'train_tokens_per_second': '1977'} +{'loss': '0.3845', 'grad_norm': '1.006', 'learning_rate': '4.999e-05', 'epoch': '0.1371', 'num_input_tokens_seen': 11143868, 'train_runtime': '5637', 'train_tokens_per_second': '1977'} +{'loss': '0.4415', 'grad_norm': '1.405', 'learning_rate': '4.999e-05', 'epoch': '0.1371', 'num_input_tokens_seen': 11145915, 'train_runtime': '5638', 'train_tokens_per_second': '1977'} +{'loss': '0.858', 'grad_norm': '1.361', 'learning_rate': '4.999e-05', 'epoch': '0.1371', 'num_input_tokens_seen': 11147962, 'train_runtime': '5639', 'train_tokens_per_second': '1977'} +{'loss': '0.3925', 'grad_norm': '1.129', 'learning_rate': '4.999e-05', 'epoch': '0.1371', 'num_input_tokens_seen': 11150009, 'train_runtime': '5640', 'train_tokens_per_second': '1977'} +{'loss': '0.8817', 'grad_norm': '1.464', 'learning_rate': '4.999e-05', 'epoch': '0.1372', 'num_input_tokens_seen': 11152056, 'train_runtime': '5641', 'train_tokens_per_second': '1977'} +{'loss': '1.181', 'grad_norm': '2.083', 'learning_rate': '4.999e-05', 'epoch': '0.1372', 'num_input_tokens_seen': 11154103, 'train_runtime': '5642', 'train_tokens_per_second': '1977'} +{'loss': '0.5226', 'grad_norm': '1.336', 'learning_rate': '4.999e-05', 'epoch': '0.1372', 'num_input_tokens_seen': 11156150, 'train_runtime': '5643', 'train_tokens_per_second': '1977'} +{'loss': '0.3504', 'grad_norm': '1.107', 'learning_rate': '4.999e-05', 'epoch': '0.1372', 'num_input_tokens_seen': 11158197, 'train_runtime': '5644', 'train_tokens_per_second': '1977'} +{'loss': '0.5332', 'grad_norm': '1.33', 'learning_rate': '4.999e-05', 'epoch': '0.1373', 'num_input_tokens_seen': 11160244, 'train_runtime': '5646', 'train_tokens_per_second': '1977'} +{'loss': '0.7264', 'grad_norm': '1.595', 'learning_rate': '4.999e-05', 'epoch': '0.1373', 'num_input_tokens_seen': 11162291, 'train_runtime': '5647', 'train_tokens_per_second': '1977'} +{'loss': '0.8233', 'grad_norm': '1.426', 'learning_rate': '4.999e-05', 'epoch': '0.1373', 'num_input_tokens_seen': 11164338, 'train_runtime': '5648', 'train_tokens_per_second': '1977'} +{'loss': '0.7603', 'grad_norm': '1.692', 'learning_rate': '4.999e-05', 'epoch': '0.1373', 'num_input_tokens_seen': 11166385, 'train_runtime': '5649', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.962', 'learning_rate': '4.999e-05', 'epoch': '0.1374', 'num_input_tokens_seen': 11168432, 'train_runtime': '5650', 'train_tokens_per_second': '1977'} +{'loss': '0.288', 'grad_norm': '0.9487', 'learning_rate': '4.999e-05', 'epoch': '0.1374', 'num_input_tokens_seen': 11170479, 'train_runtime': '5651', 'train_tokens_per_second': '1977'} +{'loss': '0.5835', 'grad_norm': '1.426', 'learning_rate': '4.999e-05', 'epoch': '0.1374', 'num_input_tokens_seen': 11172526, 'train_runtime': '5652', 'train_tokens_per_second': '1977'} +{'loss': '0.6873', 'grad_norm': '1.682', 'learning_rate': '4.999e-05', 'epoch': '0.1374', 'num_input_tokens_seen': 11174573, 'train_runtime': '5653', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '1.931', 'learning_rate': '4.999e-05', 'epoch': '0.1375', 'num_input_tokens_seen': 11176620, 'train_runtime': '5654', 'train_tokens_per_second': '1977'} +{'loss': '0.5674', 'grad_norm': '1.584', 'learning_rate': '4.999e-05', 'epoch': '0.1375', 'num_input_tokens_seen': 11178667, 'train_runtime': '5655', 'train_tokens_per_second': '1977'} +{'loss': '1.345', 'grad_norm': '2.053', 'learning_rate': '4.999e-05', 'epoch': '0.1375', 'num_input_tokens_seen': 11180714, 'train_runtime': '5656', 'train_tokens_per_second': '1977'} +{'loss': '0.5262', 'grad_norm': '1.195', 'learning_rate': '4.999e-05', 'epoch': '0.1375', 'num_input_tokens_seen': 11182761, 'train_runtime': '5657', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '2.516', 'learning_rate': '4.999e-05', 'epoch': '0.1376', 'num_input_tokens_seen': 11184808, 'train_runtime': '5658', 'train_tokens_per_second': '1977'} +{'loss': '0.6184', 'grad_norm': '1.449', 'learning_rate': '4.999e-05', 'epoch': '0.1376', 'num_input_tokens_seen': 11186855, 'train_runtime': '5659', 'train_tokens_per_second': '1977'} +{'loss': '0.8915', 'grad_norm': '1.442', 'learning_rate': '4.999e-05', 'epoch': '0.1376', 'num_input_tokens_seen': 11188902, 'train_runtime': '5660', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '2.045', 'learning_rate': '4.999e-05', 'epoch': '0.1376', 'num_input_tokens_seen': 11190949, 'train_runtime': '5661', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '1.826', 'learning_rate': '4.999e-05', 'epoch': '0.1377', 'num_input_tokens_seen': 11192996, 'train_runtime': '5662', 'train_tokens_per_second': '1977'} +{'loss': '0.8565', 'grad_norm': '1.563', 'learning_rate': '4.999e-05', 'epoch': '0.1377', 'num_input_tokens_seen': 11195043, 'train_runtime': '5663', 'train_tokens_per_second': '1977'} +{'loss': '2.349', 'grad_norm': '2.806', 'learning_rate': '4.999e-05', 'epoch': '0.1377', 'num_input_tokens_seen': 11197090, 'train_runtime': '5664', 'train_tokens_per_second': '1977'} +{'loss': '0.3845', 'grad_norm': '1.111', 'learning_rate': '4.999e-05', 'epoch': '0.1377', 'num_input_tokens_seen': 11199137, 'train_runtime': '5665', 'train_tokens_per_second': '1977'} +{'loss': '0.4017', 'grad_norm': '1.319', 'learning_rate': '4.999e-05', 'epoch': '0.1378', 'num_input_tokens_seen': 11201184, 'train_runtime': '5666', 'train_tokens_per_second': '1977'} +{'loss': '0.5329', 'grad_norm': '1.315', 'learning_rate': '4.999e-05', 'epoch': '0.1378', 'num_input_tokens_seen': 11203231, 'train_runtime': '5667', 'train_tokens_per_second': '1977'} +{'loss': '1.007', 'grad_norm': '1.641', 'learning_rate': '4.999e-05', 'epoch': '0.1378', 'num_input_tokens_seen': 11205278, 'train_runtime': '5668', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '1.731', 'learning_rate': '4.999e-05', 'epoch': '0.1379', 'num_input_tokens_seen': 11207325, 'train_runtime': '5669', 'train_tokens_per_second': '1977'} +{'loss': '0.8796', 'grad_norm': '1.668', 'learning_rate': '4.999e-05', 'epoch': '0.1379', 'num_input_tokens_seen': 11209372, 'train_runtime': '5670', 'train_tokens_per_second': '1977'} +{'loss': '0.4595', 'grad_norm': '1.166', 'learning_rate': '4.999e-05', 'epoch': '0.1379', 'num_input_tokens_seen': 11211419, 'train_runtime': '5671', 'train_tokens_per_second': '1977'} +{'loss': '0.5193', 'grad_norm': '1.097', 'learning_rate': '4.999e-05', 'epoch': '0.1379', 'num_input_tokens_seen': 11213466, 'train_runtime': '5672', 'train_tokens_per_second': '1977'} +{'loss': '0.5332', 'grad_norm': '1.116', 'learning_rate': '4.999e-05', 'epoch': '0.138', 'num_input_tokens_seen': 11215513, 'train_runtime': '5673', 'train_tokens_per_second': '1977'} +{'loss': '1.929', 'grad_norm': '2.448', 'learning_rate': '4.999e-05', 'epoch': '0.138', 'num_input_tokens_seen': 11217560, 'train_runtime': '5675', 'train_tokens_per_second': '1977'} +{'loss': '0.4303', 'grad_norm': '0.7626', 'learning_rate': '4.999e-05', 'epoch': '0.138', 'num_input_tokens_seen': 11219607, 'train_runtime': '5676', 'train_tokens_per_second': '1977'} +{'loss': '0.4958', 'grad_norm': '1.152', 'learning_rate': '4.999e-05', 'epoch': '0.138', 'num_input_tokens_seen': 11221654, 'train_runtime': '5677', 'train_tokens_per_second': '1977'} +{'loss': '0.4436', 'grad_norm': '1.026', 'learning_rate': '4.999e-05', 'epoch': '0.1381', 'num_input_tokens_seen': 11223701, 'train_runtime': '5678', 'train_tokens_per_second': '1977'} +{'loss': '0.7222', 'grad_norm': '1.376', 'learning_rate': '4.999e-05', 'epoch': '0.1381', 'num_input_tokens_seen': 11225748, 'train_runtime': '5679', 'train_tokens_per_second': '1977'} +{'loss': '0.5899', 'grad_norm': '1.332', 'learning_rate': '4.999e-05', 'epoch': '0.1381', 'num_input_tokens_seen': 11227795, 'train_runtime': '5680', 'train_tokens_per_second': '1977'} +{'loss': '0.6184', 'grad_norm': '1.636', 'learning_rate': '4.999e-05', 'epoch': '0.1381', 'num_input_tokens_seen': 11229842, 'train_runtime': '5681', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.754', 'learning_rate': '4.999e-05', 'epoch': '0.1382', 'num_input_tokens_seen': 11231889, 'train_runtime': '5682', 'train_tokens_per_second': '1977'} +{'loss': '1.108', 'grad_norm': '1.618', 'learning_rate': '4.999e-05', 'epoch': '0.1382', 'num_input_tokens_seen': 11233936, 'train_runtime': '5683', 'train_tokens_per_second': '1977'} +{'loss': '0.8862', 'grad_norm': '1.732', 'learning_rate': '4.999e-05', 'epoch': '0.1382', 'num_input_tokens_seen': 11235983, 'train_runtime': '5684', 'train_tokens_per_second': '1977'} +{'loss': '1.262', 'grad_norm': '1.712', 'learning_rate': '4.999e-05', 'epoch': '0.1382', 'num_input_tokens_seen': 11238030, 'train_runtime': '5685', 'train_tokens_per_second': '1977'} +{'loss': '0.6845', 'grad_norm': '1.339', 'learning_rate': '4.999e-05', 'epoch': '0.1383', 'num_input_tokens_seen': 11240077, 'train_runtime': '5686', 'train_tokens_per_second': '1977'} +{'loss': '0.431', 'grad_norm': '0.9676', 'learning_rate': '4.999e-05', 'epoch': '0.1383', 'num_input_tokens_seen': 11242124, 'train_runtime': '5687', 'train_tokens_per_second': '1977'} +{'loss': '0.7998', 'grad_norm': '1.521', 'learning_rate': '4.999e-05', 'epoch': '0.1383', 'num_input_tokens_seen': 11244171, 'train_runtime': '5688', 'train_tokens_per_second': '1977'} +{'loss': '1.557', 'grad_norm': '2.702', 'learning_rate': '4.999e-05', 'epoch': '0.1383', 'num_input_tokens_seen': 11246218, 'train_runtime': '5689', 'train_tokens_per_second': '1977'} +{'loss': '0.3198', 'grad_norm': '0.93', 'learning_rate': '4.999e-05', 'epoch': '0.1384', 'num_input_tokens_seen': 11248265, 'train_runtime': '5690', 'train_tokens_per_second': '1977'} +{'loss': '0.5441', 'grad_norm': '1.561', 'learning_rate': '4.999e-05', 'epoch': '0.1384', 'num_input_tokens_seen': 11250312, 'train_runtime': '5691', 'train_tokens_per_second': '1977'} +{'loss': '1.433', 'grad_norm': '2.082', 'learning_rate': '4.999e-05', 'epoch': '0.1384', 'num_input_tokens_seen': 11252359, 'train_runtime': '5692', 'train_tokens_per_second': '1977'} +{'loss': '0.6659', 'grad_norm': '1.517', 'learning_rate': '4.999e-05', 'epoch': '0.1384', 'num_input_tokens_seen': 11254406, 'train_runtime': '5693', 'train_tokens_per_second': '1977'} +{'loss': '0.3198', 'grad_norm': '1.153', 'learning_rate': '4.999e-05', 'epoch': '0.1385', 'num_input_tokens_seen': 11256453, 'train_runtime': '5694', 'train_tokens_per_second': '1977'} +{'loss': '1.937', 'grad_norm': '2.343', 'learning_rate': '4.999e-05', 'epoch': '0.1385', 'num_input_tokens_seen': 11258500, 'train_runtime': '5695', 'train_tokens_per_second': '1977'} +{'loss': '0.6835', 'grad_norm': '1.827', 'learning_rate': '4.999e-05', 'epoch': '0.1385', 'num_input_tokens_seen': 11260547, 'train_runtime': '5696', 'train_tokens_per_second': '1977'} +{'loss': '0.653', 'grad_norm': '1.344', 'learning_rate': '4.999e-05', 'epoch': '0.1385', 'num_input_tokens_seen': 11262594, 'train_runtime': '5697', 'train_tokens_per_second': '1977'} +{'loss': '1.101', 'grad_norm': '1.645', 'learning_rate': '4.999e-05', 'epoch': '0.1386', 'num_input_tokens_seen': 11264641, 'train_runtime': '5698', 'train_tokens_per_second': '1977'} +{'loss': '0.924', 'grad_norm': '1.729', 'learning_rate': '4.999e-05', 'epoch': '0.1386', 'num_input_tokens_seen': 11266688, 'train_runtime': '5699', 'train_tokens_per_second': '1977'} +{'loss': '0.7716', 'grad_norm': '1.515', 'learning_rate': '4.999e-05', 'epoch': '0.1386', 'num_input_tokens_seen': 11268735, 'train_runtime': '5700', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '1.81', 'learning_rate': '4.999e-05', 'epoch': '0.1386', 'num_input_tokens_seen': 11270782, 'train_runtime': '5701', 'train_tokens_per_second': '1977'} +{'loss': '2.087', 'grad_norm': '2.654', 'learning_rate': '4.999e-05', 'epoch': '0.1387', 'num_input_tokens_seen': 11272829, 'train_runtime': '5702', 'train_tokens_per_second': '1977'} +{'loss': '0.3386', 'grad_norm': '0.9861', 'learning_rate': '4.999e-05', 'epoch': '0.1387', 'num_input_tokens_seen': 11274876, 'train_runtime': '5704', 'train_tokens_per_second': '1977'} +{'loss': '1.378', 'grad_norm': '2.058', 'learning_rate': '4.999e-05', 'epoch': '0.1387', 'num_input_tokens_seen': 11276923, 'train_runtime': '5705', 'train_tokens_per_second': '1977'} +{'loss': '0.3804', 'grad_norm': '1.146', 'learning_rate': '4.999e-05', 'epoch': '0.1387', 'num_input_tokens_seen': 11278970, 'train_runtime': '5706', 'train_tokens_per_second': '1977'} +{'loss': '0.5901', 'grad_norm': '1.451', 'learning_rate': '4.999e-05', 'epoch': '0.1388', 'num_input_tokens_seen': 11281017, 'train_runtime': '5707', 'train_tokens_per_second': '1977'} +{'loss': '0.8015', 'grad_norm': '1.787', 'learning_rate': '4.999e-05', 'epoch': '0.1388', 'num_input_tokens_seen': 11283064, 'train_runtime': '5708', 'train_tokens_per_second': '1977'} +{'loss': '0.3305', 'grad_norm': '1.362', 'learning_rate': '4.999e-05', 'epoch': '0.1388', 'num_input_tokens_seen': 11285111, 'train_runtime': '5709', 'train_tokens_per_second': '1977'} +{'loss': '0.2903', 'grad_norm': '1.28', 'learning_rate': '4.999e-05', 'epoch': '0.1388', 'num_input_tokens_seen': 11287158, 'train_runtime': '5710', 'train_tokens_per_second': '1977'} +{'loss': '0.7741', 'grad_norm': '1.514', 'learning_rate': '4.999e-05', 'epoch': '0.1389', 'num_input_tokens_seen': 11289205, 'train_runtime': '5711', 'train_tokens_per_second': '1977'} +{'loss': '0.4004', 'grad_norm': '1.076', 'learning_rate': '4.999e-05', 'epoch': '0.1389', 'num_input_tokens_seen': 11291252, 'train_runtime': '5712', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.207', 'learning_rate': '4.999e-05', 'epoch': '0.1389', 'num_input_tokens_seen': 11293299, 'train_runtime': '5713', 'train_tokens_per_second': '1977'} +{'loss': '0.384', 'grad_norm': '0.8925', 'learning_rate': '4.999e-05', 'epoch': '0.1389', 'num_input_tokens_seen': 11295346, 'train_runtime': '5714', 'train_tokens_per_second': '1977'} +{'loss': '0.6075', 'grad_norm': '1.35', 'learning_rate': '4.999e-05', 'epoch': '0.139', 'num_input_tokens_seen': 11297393, 'train_runtime': '5715', 'train_tokens_per_second': '1977'} +{'loss': '1.386', 'grad_norm': '2.137', 'learning_rate': '4.999e-05', 'epoch': '0.139', 'num_input_tokens_seen': 11299440, 'train_runtime': '5716', 'train_tokens_per_second': '1977'} +{'loss': '1.609', 'grad_norm': '2.49', 'learning_rate': '4.999e-05', 'epoch': '0.139', 'num_input_tokens_seen': 11301487, 'train_runtime': '5717', 'train_tokens_per_second': '1977'} +{'loss': '0.3838', 'grad_norm': '1.462', 'learning_rate': '4.999e-05', 'epoch': '0.139', 'num_input_tokens_seen': 11303534, 'train_runtime': '5718', 'train_tokens_per_second': '1977'} +{'loss': '0.4625', 'grad_norm': '1.274', 'learning_rate': '4.999e-05', 'epoch': '0.1391', 'num_input_tokens_seen': 11305581, 'train_runtime': '5719', 'train_tokens_per_second': '1977'} +{'loss': '0.8172', 'grad_norm': '1.557', 'learning_rate': '4.999e-05', 'epoch': '0.1391', 'num_input_tokens_seen': 11307628, 'train_runtime': '5720', 'train_tokens_per_second': '1977'} +{'loss': '1.418', 'grad_norm': '2.127', 'learning_rate': '4.999e-05', 'epoch': '0.1391', 'num_input_tokens_seen': 11309675, 'train_runtime': '5721', 'train_tokens_per_second': '1977'} +{'loss': '0.3913', 'grad_norm': '1.234', 'learning_rate': '4.999e-05', 'epoch': '0.1391', 'num_input_tokens_seen': 11311722, 'train_runtime': '5722', 'train_tokens_per_second': '1977'} +{'loss': '0.5462', 'grad_norm': '1.509', 'learning_rate': '4.999e-05', 'epoch': '0.1392', 'num_input_tokens_seen': 11313769, 'train_runtime': '5723', 'train_tokens_per_second': '1977'} +{'loss': '0.4855', 'grad_norm': '1.216', 'learning_rate': '4.999e-05', 'epoch': '0.1392', 'num_input_tokens_seen': 11315816, 'train_runtime': '5724', 'train_tokens_per_second': '1977'} +{'loss': '1.24', 'grad_norm': '1.678', 'learning_rate': '4.999e-05', 'epoch': '0.1392', 'num_input_tokens_seen': 11317863, 'train_runtime': '5725', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '1.947', 'learning_rate': '4.999e-05', 'epoch': '0.1392', 'num_input_tokens_seen': 11319910, 'train_runtime': '5726', 'train_tokens_per_second': '1977'} +{'loss': '0.3677', 'grad_norm': '1.258', 'learning_rate': '4.999e-05', 'epoch': '0.1393', 'num_input_tokens_seen': 11321957, 'train_runtime': '5727', 'train_tokens_per_second': '1977'} +{'loss': '1.438', 'grad_norm': '1.974', 'learning_rate': '4.999e-05', 'epoch': '0.1393', 'num_input_tokens_seen': 11324004, 'train_runtime': '5728', 'train_tokens_per_second': '1977'} +{'loss': '0.3953', 'grad_norm': '1.344', 'learning_rate': '4.999e-05', 'epoch': '0.1393', 'num_input_tokens_seen': 11326051, 'train_runtime': '5729', 'train_tokens_per_second': '1977'} +{'loss': '0.6202', 'grad_norm': '1.27', 'learning_rate': '4.999e-05', 'epoch': '0.1393', 'num_input_tokens_seen': 11328098, 'train_runtime': '5730', 'train_tokens_per_second': '1977'} +{'loss': '1.079', 'grad_norm': '1.698', 'learning_rate': '4.999e-05', 'epoch': '0.1394', 'num_input_tokens_seen': 11330145, 'train_runtime': '5731', 'train_tokens_per_second': '1977'} +{'loss': '0.4226', 'grad_norm': '1.264', 'learning_rate': '4.999e-05', 'epoch': '0.1394', 'num_input_tokens_seen': 11332192, 'train_runtime': '5732', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.849', 'learning_rate': '4.999e-05', 'epoch': '0.1394', 'num_input_tokens_seen': 11334239, 'train_runtime': '5734', 'train_tokens_per_second': '1977'} +{'loss': '0.6864', 'grad_norm': '1.305', 'learning_rate': '4.999e-05', 'epoch': '0.1394', 'num_input_tokens_seen': 11336286, 'train_runtime': '5735', 'train_tokens_per_second': '1977'} +{'loss': '3.069', 'grad_norm': '2.381', 'learning_rate': '4.999e-05', 'epoch': '0.1395', 'num_input_tokens_seen': 11338333, 'train_runtime': '5736', 'train_tokens_per_second': '1977'} +{'loss': '0.6845', 'grad_norm': '1.405', 'learning_rate': '4.999e-05', 'epoch': '0.1395', 'num_input_tokens_seen': 11340380, 'train_runtime': '5737', 'train_tokens_per_second': '1977'} +{'loss': '0.3713', 'grad_norm': '1.315', 'learning_rate': '4.999e-05', 'epoch': '0.1395', 'num_input_tokens_seen': 11342427, 'train_runtime': '5738', 'train_tokens_per_second': '1977'} +{'loss': '1.746', 'grad_norm': '2.186', 'learning_rate': '4.999e-05', 'epoch': '0.1395', 'num_input_tokens_seen': 11344474, 'train_runtime': '5739', 'train_tokens_per_second': '1977'} +{'loss': '1.634', 'grad_norm': '1.971', 'learning_rate': '4.999e-05', 'epoch': '0.1396', 'num_input_tokens_seen': 11346521, 'train_runtime': '5740', 'train_tokens_per_second': '1977'} +{'loss': '1.193', 'grad_norm': '1.644', 'learning_rate': '4.999e-05', 'epoch': '0.1396', 'num_input_tokens_seen': 11348568, 'train_runtime': '5741', 'train_tokens_per_second': '1977'} +{'loss': '1.845', 'grad_norm': '2.125', 'learning_rate': '4.999e-05', 'epoch': '0.1396', 'num_input_tokens_seen': 11350615, 'train_runtime': '5742', 'train_tokens_per_second': '1977'} +{'loss': '0.3239', 'grad_norm': '1.053', 'learning_rate': '4.999e-05', 'epoch': '0.1396', 'num_input_tokens_seen': 11352662, 'train_runtime': '5743', 'train_tokens_per_second': '1977'} +{'loss': '0.5449', 'grad_norm': '1.174', 'learning_rate': '4.999e-05', 'epoch': '0.1397', 'num_input_tokens_seen': 11354709, 'train_runtime': '5744', 'train_tokens_per_second': '1977'} +{'loss': '1.324', 'grad_norm': '2.008', 'learning_rate': '4.999e-05', 'epoch': '0.1397', 'num_input_tokens_seen': 11356756, 'train_runtime': '5745', 'train_tokens_per_second': '1977'} +{'loss': '0.5512', 'grad_norm': '1.42', 'learning_rate': '4.999e-05', 'epoch': '0.1397', 'num_input_tokens_seen': 11358803, 'train_runtime': '5746', 'train_tokens_per_second': '1977'} +{'loss': '1.225', 'grad_norm': '2.258', 'learning_rate': '4.999e-05', 'epoch': '0.1397', 'num_input_tokens_seen': 11360850, 'train_runtime': '5747', 'train_tokens_per_second': '1977'} +{'loss': '0.9715', 'grad_norm': '1.652', 'learning_rate': '4.999e-05', 'epoch': '0.1398', 'num_input_tokens_seen': 11362897, 'train_runtime': '5748', 'train_tokens_per_second': '1977'} +{'loss': '0.8223', 'grad_norm': '1.125', 'learning_rate': '4.999e-05', 'epoch': '0.1398', 'num_input_tokens_seen': 11364944, 'train_runtime': '5749', 'train_tokens_per_second': '1977'} +{'loss': '0.3055', 'grad_norm': '0.8937', 'learning_rate': '4.999e-05', 'epoch': '0.1398', 'num_input_tokens_seen': 11366991, 'train_runtime': '5750', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '1.828', 'learning_rate': '4.999e-05', 'epoch': '0.1398', 'num_input_tokens_seen': 11369038, 'train_runtime': '5751', 'train_tokens_per_second': '1977'} +{'loss': '1.266', 'grad_norm': '2.178', 'learning_rate': '4.999e-05', 'epoch': '0.1399', 'num_input_tokens_seen': 11371085, 'train_runtime': '5752', 'train_tokens_per_second': '1977'} +{'loss': '0.9549', 'grad_norm': '1.831', 'learning_rate': '4.999e-05', 'epoch': '0.1399', 'num_input_tokens_seen': 11373132, 'train_runtime': '5753', 'train_tokens_per_second': '1977'} +{'loss': '0.3036', 'grad_norm': '1.102', 'learning_rate': '4.999e-05', 'epoch': '0.1399', 'num_input_tokens_seen': 11375179, 'train_runtime': '5754', 'train_tokens_per_second': '1977'} +{'loss': '0.3216', 'grad_norm': '1.608', 'learning_rate': '4.999e-05', 'epoch': '0.1399', 'num_input_tokens_seen': 11377226, 'train_runtime': '5755', 'train_tokens_per_second': '1977'} +{'loss': '0.2625', 'grad_norm': '1.023', 'learning_rate': '4.999e-05', 'epoch': '0.14', 'num_input_tokens_seen': 11379273, 'train_runtime': '5756', 'train_tokens_per_second': '1977'} +{'loss': '1.204', 'grad_norm': '2.537', 'learning_rate': '4.999e-05', 'epoch': '0.14', 'num_input_tokens_seen': 11381320, 'train_runtime': '5757', 'train_tokens_per_second': '1977'} +{'loss': '0.7235', 'grad_norm': '1.152', 'learning_rate': '4.999e-05', 'epoch': '0.14', 'num_input_tokens_seen': 11383367, 'train_runtime': '5758', 'train_tokens_per_second': '1977'} +{'loss': '1.674', 'grad_norm': '2.369', 'learning_rate': '4.999e-05', 'epoch': '0.14', 'num_input_tokens_seen': 11385414, 'train_runtime': '5759', 'train_tokens_per_second': '1977'} +{'loss': '0.4026', 'grad_norm': '1.039', 'learning_rate': '4.999e-05', 'epoch': '0.1401', 'num_input_tokens_seen': 11387461, 'train_runtime': '5760', 'train_tokens_per_second': '1977'} +{'loss': '1.925', 'grad_norm': '2.327', 'learning_rate': '4.999e-05', 'epoch': '0.1401', 'num_input_tokens_seen': 11389508, 'train_runtime': '5761', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.063', 'learning_rate': '4.999e-05', 'epoch': '0.1401', 'num_input_tokens_seen': 11391555, 'train_runtime': '5762', 'train_tokens_per_second': '1977'} +{'loss': '0.6988', 'grad_norm': '1.563', 'learning_rate': '4.999e-05', 'epoch': '0.1401', 'num_input_tokens_seen': 11393602, 'train_runtime': '5763', 'train_tokens_per_second': '1977'} +{'loss': '0.9278', 'grad_norm': '1.58', 'learning_rate': '4.999e-05', 'epoch': '0.1402', 'num_input_tokens_seen': 11395649, 'train_runtime': '5765', 'train_tokens_per_second': '1977'} +{'loss': '0.2552', 'grad_norm': '1.06', 'learning_rate': '4.999e-05', 'epoch': '0.1402', 'num_input_tokens_seen': 11397696, 'train_runtime': '5766', 'train_tokens_per_second': '1977'} +{'loss': '0.4123', 'grad_norm': '1.081', 'learning_rate': '4.999e-05', 'epoch': '0.1402', 'num_input_tokens_seen': 11399743, 'train_runtime': '5767', 'train_tokens_per_second': '1977'} +{'loss': '0.9622', 'grad_norm': '1.322', 'learning_rate': '4.999e-05', 'epoch': '0.1402', 'num_input_tokens_seen': 11401790, 'train_runtime': '5768', 'train_tokens_per_second': '1977'} +{'loss': '0.7077', 'grad_norm': '1.271', 'learning_rate': '4.999e-05', 'epoch': '0.1403', 'num_input_tokens_seen': 11403837, 'train_runtime': '5769', 'train_tokens_per_second': '1977'} +{'loss': '1.451', 'grad_norm': '1.956', 'learning_rate': '4.999e-05', 'epoch': '0.1403', 'num_input_tokens_seen': 11405884, 'train_runtime': '5770', 'train_tokens_per_second': '1977'} +{'loss': '2.435', 'grad_norm': '2.823', 'learning_rate': '4.999e-05', 'epoch': '0.1403', 'num_input_tokens_seen': 11407931, 'train_runtime': '5771', 'train_tokens_per_second': '1977'} +{'loss': '0.6048', 'grad_norm': '1.517', 'learning_rate': '4.999e-05', 'epoch': '0.1403', 'num_input_tokens_seen': 11409978, 'train_runtime': '5772', 'train_tokens_per_second': '1977'} +{'loss': '1.388', 'grad_norm': '2.035', 'learning_rate': '4.999e-05', 'epoch': '0.1404', 'num_input_tokens_seen': 11412025, 'train_runtime': '5773', 'train_tokens_per_second': '1977'} +{'loss': '0.6769', 'grad_norm': '1.327', 'learning_rate': '4.999e-05', 'epoch': '0.1404', 'num_input_tokens_seen': 11414072, 'train_runtime': '5774', 'train_tokens_per_second': '1977'} +{'loss': '0.4524', 'grad_norm': '1.027', 'learning_rate': '4.999e-05', 'epoch': '0.1404', 'num_input_tokens_seen': 11416119, 'train_runtime': '5775', 'train_tokens_per_second': '1977'} +{'loss': '2.211', 'grad_norm': '3.732', 'learning_rate': '4.999e-05', 'epoch': '0.1404', 'num_input_tokens_seen': 11418166, 'train_runtime': '5776', 'train_tokens_per_second': '1977'} +{'loss': '0.6512', 'grad_norm': '1.418', 'learning_rate': '4.999e-05', 'epoch': '0.1405', 'num_input_tokens_seen': 11420213, 'train_runtime': '5777', 'train_tokens_per_second': '1977'} +{'loss': '1.091', 'grad_norm': '1.485', 'learning_rate': '4.999e-05', 'epoch': '0.1405', 'num_input_tokens_seen': 11422260, 'train_runtime': '5778', 'train_tokens_per_second': '1977'} +{'loss': '0.4911', 'grad_norm': '1.14', 'learning_rate': '4.999e-05', 'epoch': '0.1405', 'num_input_tokens_seen': 11424307, 'train_runtime': '5779', 'train_tokens_per_second': '1977'} +{'loss': '0.7612', 'grad_norm': '2.12', 'learning_rate': '4.999e-05', 'epoch': '0.1405', 'num_input_tokens_seen': 11426354, 'train_runtime': '5780', 'train_tokens_per_second': '1977'} +{'loss': '0.7233', 'grad_norm': '1.35', 'learning_rate': '4.999e-05', 'epoch': '0.1406', 'num_input_tokens_seen': 11428401, 'train_runtime': '5781', 'train_tokens_per_second': '1977'} +{'loss': '0.6455', 'grad_norm': '1.898', 'learning_rate': '4.999e-05', 'epoch': '0.1406', 'num_input_tokens_seen': 11430448, 'train_runtime': '5782', 'train_tokens_per_second': '1977'} +{'loss': '2.06', 'grad_norm': '2.556', 'learning_rate': '4.999e-05', 'epoch': '0.1406', 'num_input_tokens_seen': 11432495, 'train_runtime': '5783', 'train_tokens_per_second': '1977'} +{'loss': '0.707', 'grad_norm': '1.383', 'learning_rate': '4.999e-05', 'epoch': '0.1406', 'num_input_tokens_seen': 11434542, 'train_runtime': '5784', 'train_tokens_per_second': '1977'} +{'loss': '0.5391', 'grad_norm': '1.143', 'learning_rate': '4.999e-05', 'epoch': '0.1407', 'num_input_tokens_seen': 11436589, 'train_runtime': '5785', 'train_tokens_per_second': '1977'} +{'loss': '1.138', 'grad_norm': '1.612', 'learning_rate': '4.999e-05', 'epoch': '0.1407', 'num_input_tokens_seen': 11438636, 'train_runtime': '5786', 'train_tokens_per_second': '1977'} +{'loss': '1.561', 'grad_norm': '2.251', 'learning_rate': '4.999e-05', 'epoch': '0.1407', 'num_input_tokens_seen': 11440683, 'train_runtime': '5787', 'train_tokens_per_second': '1977'} +{'loss': '0.2605', 'grad_norm': '1.058', 'learning_rate': '4.999e-05', 'epoch': '0.1407', 'num_input_tokens_seen': 11442730, 'train_runtime': '5788', 'train_tokens_per_second': '1977'} +{'loss': '0.3618', 'grad_norm': '1.112', 'learning_rate': '4.999e-05', 'epoch': '0.1408', 'num_input_tokens_seen': 11444777, 'train_runtime': '5789', 'train_tokens_per_second': '1977'} +{'loss': '1.797', 'grad_norm': '2.216', 'learning_rate': '4.999e-05', 'epoch': '0.1408', 'num_input_tokens_seen': 11446824, 'train_runtime': '5790', 'train_tokens_per_second': '1977'} +{'loss': '1.493', 'grad_norm': '2.208', 'learning_rate': '4.999e-05', 'epoch': '0.1408', 'num_input_tokens_seen': 11448871, 'train_runtime': '5791', 'train_tokens_per_second': '1977'} +{'loss': '2.191', 'grad_norm': '2.618', 'learning_rate': '4.999e-05', 'epoch': '0.1408', 'num_input_tokens_seen': 11450918, 'train_runtime': '5792', 'train_tokens_per_second': '1977'} +{'loss': '0.7051', 'grad_norm': '1.29', 'learning_rate': '4.999e-05', 'epoch': '0.1409', 'num_input_tokens_seen': 11452965, 'train_runtime': '5794', 'train_tokens_per_second': '1977'} +{'loss': '0.4686', 'grad_norm': '1.159', 'learning_rate': '4.999e-05', 'epoch': '0.1409', 'num_input_tokens_seen': 11455012, 'train_runtime': '5795', 'train_tokens_per_second': '1977'} +{'loss': '0.9324', 'grad_norm': '1.662', 'learning_rate': '4.999e-05', 'epoch': '0.1409', 'num_input_tokens_seen': 11457059, 'train_runtime': '5796', 'train_tokens_per_second': '1977'} +{'loss': '0.2664', 'grad_norm': '1.075', 'learning_rate': '4.999e-05', 'epoch': '0.1409', 'num_input_tokens_seen': 11459106, 'train_runtime': '5797', 'train_tokens_per_second': '1977'} +{'loss': '0.7676', 'grad_norm': '1.631', 'learning_rate': '4.999e-05', 'epoch': '0.141', 'num_input_tokens_seen': 11461153, 'train_runtime': '5798', 'train_tokens_per_second': '1977'} +{'loss': '2.002', 'grad_norm': '2.392', 'learning_rate': '4.999e-05', 'epoch': '0.141', 'num_input_tokens_seen': 11463200, 'train_runtime': '5799', 'train_tokens_per_second': '1977'} +{'loss': '0.445', 'grad_norm': '1.491', 'learning_rate': '4.999e-05', 'epoch': '0.141', 'num_input_tokens_seen': 11465247, 'train_runtime': '5800', 'train_tokens_per_second': '1977'} +{'loss': '0.807', 'grad_norm': '1.315', 'learning_rate': '4.999e-05', 'epoch': '0.141', 'num_input_tokens_seen': 11467294, 'train_runtime': '5801', 'train_tokens_per_second': '1977'} +{'loss': '1.16', 'grad_norm': '2.73', 'learning_rate': '4.999e-05', 'epoch': '0.1411', 'num_input_tokens_seen': 11469341, 'train_runtime': '5802', 'train_tokens_per_second': '1977'} +{'loss': '0.247', 'grad_norm': '0.9409', 'learning_rate': '4.999e-05', 'epoch': '0.1411', 'num_input_tokens_seen': 11471388, 'train_runtime': '5803', 'train_tokens_per_second': '1977'} +{'loss': '2.375', 'grad_norm': '2.779', 'learning_rate': '4.999e-05', 'epoch': '0.1411', 'num_input_tokens_seen': 11473435, 'train_runtime': '5804', 'train_tokens_per_second': '1977'} +{'loss': '0.5873', 'grad_norm': '1.608', 'learning_rate': '4.999e-05', 'epoch': '0.1411', 'num_input_tokens_seen': 11475482, 'train_runtime': '5805', 'train_tokens_per_second': '1977'} +{'loss': '0.2625', 'grad_norm': '1.053', 'learning_rate': '4.999e-05', 'epoch': '0.1412', 'num_input_tokens_seen': 11477529, 'train_runtime': '5806', 'train_tokens_per_second': '1977'} +{'loss': '0.6462', 'grad_norm': '1.6', 'learning_rate': '4.999e-05', 'epoch': '0.1412', 'num_input_tokens_seen': 11479576, 'train_runtime': '5807', 'train_tokens_per_second': '1977'} +{'loss': '0.5544', 'grad_norm': '1.244', 'learning_rate': '4.999e-05', 'epoch': '0.1412', 'num_input_tokens_seen': 11481623, 'train_runtime': '5808', 'train_tokens_per_second': '1977'} +{'loss': '0.7675', 'grad_norm': '1.457', 'learning_rate': '4.999e-05', 'epoch': '0.1412', 'num_input_tokens_seen': 11483670, 'train_runtime': '5809', 'train_tokens_per_second': '1977'} +{'loss': '1.04', 'grad_norm': '1.538', 'learning_rate': '4.999e-05', 'epoch': '0.1413', 'num_input_tokens_seen': 11485717, 'train_runtime': '5810', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '1.862', 'learning_rate': '4.999e-05', 'epoch': '0.1413', 'num_input_tokens_seen': 11487764, 'train_runtime': '5811', 'train_tokens_per_second': '1977'} +{'loss': '0.7458', 'grad_norm': '1.646', 'learning_rate': '4.999e-05', 'epoch': '0.1413', 'num_input_tokens_seen': 11489811, 'train_runtime': '5812', 'train_tokens_per_second': '1977'} +{'loss': '1.667', 'grad_norm': '2.122', 'learning_rate': '4.999e-05', 'epoch': '0.1414', 'num_input_tokens_seen': 11491858, 'train_runtime': '5813', 'train_tokens_per_second': '1977'} +{'loss': '1.51', 'grad_norm': '2.165', 'learning_rate': '4.999e-05', 'epoch': '0.1414', 'num_input_tokens_seen': 11493905, 'train_runtime': '5814', 'train_tokens_per_second': '1977'} +{'loss': '0.8852', 'grad_norm': '1.739', 'learning_rate': '4.999e-05', 'epoch': '0.1414', 'num_input_tokens_seen': 11495952, 'train_runtime': '5815', 'train_tokens_per_second': '1977'} +{'loss': '0.8769', 'grad_norm': '1.589', 'learning_rate': '4.999e-05', 'epoch': '0.1414', 'num_input_tokens_seen': 11497999, 'train_runtime': '5816', 'train_tokens_per_second': '1977'} +{'loss': '1.928', 'grad_norm': '2.589', 'learning_rate': '4.999e-05', 'epoch': '0.1415', 'num_input_tokens_seen': 11500046, 'train_runtime': '5817', 'train_tokens_per_second': '1977'} +{'loss': '1.55', 'grad_norm': '2.123', 'learning_rate': '4.999e-05', 'epoch': '0.1415', 'num_input_tokens_seen': 11502093, 'train_runtime': '5818', 'train_tokens_per_second': '1977'} +{'loss': '0.6395', 'grad_norm': '1.517', 'learning_rate': '4.999e-05', 'epoch': '0.1415', 'num_input_tokens_seen': 11504140, 'train_runtime': '5819', 'train_tokens_per_second': '1977'} +{'loss': '0.403', 'grad_norm': '1.159', 'learning_rate': '4.999e-05', 'epoch': '0.1415', 'num_input_tokens_seen': 11506187, 'train_runtime': '5820', 'train_tokens_per_second': '1977'} +{'loss': '0.5116', 'grad_norm': '1.12', 'learning_rate': '4.999e-05', 'epoch': '0.1416', 'num_input_tokens_seen': 11508234, 'train_runtime': '5821', 'train_tokens_per_second': '1977'} +{'loss': '0.2935', 'grad_norm': '1.025', 'learning_rate': '4.999e-05', 'epoch': '0.1416', 'num_input_tokens_seen': 11510281, 'train_runtime': '5822', 'train_tokens_per_second': '1977'} +{'loss': '1.304', 'grad_norm': '1.734', 'learning_rate': '4.999e-05', 'epoch': '0.1416', 'num_input_tokens_seen': 11512328, 'train_runtime': '5824', 'train_tokens_per_second': '1977'} +{'loss': '1.27', 'grad_norm': '1.987', 'learning_rate': '4.999e-05', 'epoch': '0.1416', 'num_input_tokens_seen': 11514375, 'train_runtime': '5825', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '1.652', 'learning_rate': '4.999e-05', 'epoch': '0.1417', 'num_input_tokens_seen': 11516422, 'train_runtime': '5826', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.921', 'learning_rate': '4.999e-05', 'epoch': '0.1417', 'num_input_tokens_seen': 11518469, 'train_runtime': '5827', 'train_tokens_per_second': '1977'} +{'loss': '0.8507', 'grad_norm': '1.377', 'learning_rate': '4.999e-05', 'epoch': '0.1417', 'num_input_tokens_seen': 11520516, 'train_runtime': '5828', 'train_tokens_per_second': '1977'} +{'loss': '0.3715', 'grad_norm': '1.133', 'learning_rate': '4.999e-05', 'epoch': '0.1417', 'num_input_tokens_seen': 11522563, 'train_runtime': '5829', 'train_tokens_per_second': '1977'} +{'loss': '0.705', 'grad_norm': '1.542', 'learning_rate': '4.999e-05', 'epoch': '0.1418', 'num_input_tokens_seen': 11524610, 'train_runtime': '5830', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '1.894', 'learning_rate': '4.999e-05', 'epoch': '0.1418', 'num_input_tokens_seen': 11526657, 'train_runtime': '5831', 'train_tokens_per_second': '1977'} +{'loss': '0.8739', 'grad_norm': '1.148', 'learning_rate': '4.999e-05', 'epoch': '0.1418', 'num_input_tokens_seen': 11528704, 'train_runtime': '5832', 'train_tokens_per_second': '1977'} +{'loss': '0.7238', 'grad_norm': '1.271', 'learning_rate': '4.999e-05', 'epoch': '0.1418', 'num_input_tokens_seen': 11530751, 'train_runtime': '5833', 'train_tokens_per_second': '1977'} +{'loss': '0.3918', 'grad_norm': '1.071', 'learning_rate': '4.999e-05', 'epoch': '0.1419', 'num_input_tokens_seen': 11532798, 'train_runtime': '5834', 'train_tokens_per_second': '1977'} +{'loss': '1.734', 'grad_norm': '2.337', 'learning_rate': '4.999e-05', 'epoch': '0.1419', 'num_input_tokens_seen': 11534845, 'train_runtime': '5835', 'train_tokens_per_second': '1977'} +{'loss': '0.3523', 'grad_norm': '1.031', 'learning_rate': '4.999e-05', 'epoch': '0.1419', 'num_input_tokens_seen': 11536892, 'train_runtime': '5836', 'train_tokens_per_second': '1977'} +{'loss': '0.3962', 'grad_norm': '1.446', 'learning_rate': '4.999e-05', 'epoch': '0.1419', 'num_input_tokens_seen': 11538939, 'train_runtime': '5837', 'train_tokens_per_second': '1977'} +{'loss': '1.039', 'grad_norm': '1.625', 'learning_rate': '4.999e-05', 'epoch': '0.142', 'num_input_tokens_seen': 11540986, 'train_runtime': '5838', 'train_tokens_per_second': '1977'} +{'loss': '0.5354', 'grad_norm': '1.29', 'learning_rate': '4.999e-05', 'epoch': '0.142', 'num_input_tokens_seen': 11543033, 'train_runtime': '5839', 'train_tokens_per_second': '1977'} +{'loss': '0.346', 'grad_norm': '0.9709', 'learning_rate': '4.999e-05', 'epoch': '0.142', 'num_input_tokens_seen': 11545080, 'train_runtime': '5840', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '1.686', 'learning_rate': '4.999e-05', 'epoch': '0.142', 'num_input_tokens_seen': 11547127, 'train_runtime': '5841', 'train_tokens_per_second': '1977'} +{'loss': '0.4224', 'grad_norm': '1.046', 'learning_rate': '4.999e-05', 'epoch': '0.1421', 'num_input_tokens_seen': 11549174, 'train_runtime': '5842', 'train_tokens_per_second': '1977'} +{'loss': '0.9756', 'grad_norm': '1.824', 'learning_rate': '4.999e-05', 'epoch': '0.1421', 'num_input_tokens_seen': 11551221, 'train_runtime': '5843', 'train_tokens_per_second': '1977'} +{'loss': '0.4082', 'grad_norm': '1.096', 'learning_rate': '4.999e-05', 'epoch': '0.1421', 'num_input_tokens_seen': 11553268, 'train_runtime': '5844', 'train_tokens_per_second': '1977'} +{'loss': '0.8957', 'grad_norm': '1.634', 'learning_rate': '4.999e-05', 'epoch': '0.1421', 'num_input_tokens_seen': 11555315, 'train_runtime': '5845', 'train_tokens_per_second': '1977'} +{'loss': '1.167', 'grad_norm': '1.666', 'learning_rate': '4.999e-05', 'epoch': '0.1422', 'num_input_tokens_seen': 11557362, 'train_runtime': '5846', 'train_tokens_per_second': '1977'} +{'loss': '1.333', 'grad_norm': '2.108', 'learning_rate': '4.999e-05', 'epoch': '0.1422', 'num_input_tokens_seen': 11559409, 'train_runtime': '5847', 'train_tokens_per_second': '1977'} +{'loss': '0.4276', 'grad_norm': '1.108', 'learning_rate': '4.999e-05', 'epoch': '0.1422', 'num_input_tokens_seen': 11561456, 'train_runtime': '5848', 'train_tokens_per_second': '1977'} +{'loss': '0.3083', 'grad_norm': '0.9291', 'learning_rate': '4.999e-05', 'epoch': '0.1422', 'num_input_tokens_seen': 11563503, 'train_runtime': '5849', 'train_tokens_per_second': '1977'} +{'loss': '0.7649', 'grad_norm': '1.734', 'learning_rate': '4.999e-05', 'epoch': '0.1423', 'num_input_tokens_seen': 11565550, 'train_runtime': '5850', 'train_tokens_per_second': '1977'} +{'loss': '1.054', 'grad_norm': '1.93', 'learning_rate': '4.999e-05', 'epoch': '0.1423', 'num_input_tokens_seen': 11567597, 'train_runtime': '5851', 'train_tokens_per_second': '1977'} +{'loss': '0.6063', 'grad_norm': '1.252', 'learning_rate': '4.999e-05', 'epoch': '0.1423', 'num_input_tokens_seen': 11569644, 'train_runtime': '5852', 'train_tokens_per_second': '1977'} +{'loss': '0.3265', 'grad_norm': '0.8792', 'learning_rate': '4.999e-05', 'epoch': '0.1423', 'num_input_tokens_seen': 11571691, 'train_runtime': '5854', 'train_tokens_per_second': '1977'} +{'loss': '1.542', 'grad_norm': '2.201', 'learning_rate': '4.999e-05', 'epoch': '0.1424', 'num_input_tokens_seen': 11573738, 'train_runtime': '5855', 'train_tokens_per_second': '1977'} +{'loss': '0.5285', 'grad_norm': '1.244', 'learning_rate': '4.999e-05', 'epoch': '0.1424', 'num_input_tokens_seen': 11575785, 'train_runtime': '5856', 'train_tokens_per_second': '1977'} +{'loss': '0.1698', 'grad_norm': '0.839', 'learning_rate': '4.999e-05', 'epoch': '0.1424', 'num_input_tokens_seen': 11577832, 'train_runtime': '5857', 'train_tokens_per_second': '1977'} +{'loss': '0.3176', 'grad_norm': '1.005', 'learning_rate': '4.999e-05', 'epoch': '0.1424', 'num_input_tokens_seen': 11579879, 'train_runtime': '5858', 'train_tokens_per_second': '1977'} +{'loss': '1.703', 'grad_norm': '2.068', 'learning_rate': '4.999e-05', 'epoch': '0.1425', 'num_input_tokens_seen': 11581926, 'train_runtime': '5859', 'train_tokens_per_second': '1977'} +{'loss': '0.8352', 'grad_norm': '1.441', 'learning_rate': '4.999e-05', 'epoch': '0.1425', 'num_input_tokens_seen': 11583973, 'train_runtime': '5860', 'train_tokens_per_second': '1977'} +{'loss': '0.5791', 'grad_norm': '1.121', 'learning_rate': '4.999e-05', 'epoch': '0.1425', 'num_input_tokens_seen': 11586020, 'train_runtime': '5861', 'train_tokens_per_second': '1977'} +{'loss': '0.9547', 'grad_norm': '1.511', 'learning_rate': '4.999e-05', 'epoch': '0.1425', 'num_input_tokens_seen': 11588067, 'train_runtime': '5862', 'train_tokens_per_second': '1977'} +{'loss': '0.5174', 'grad_norm': '1.206', 'learning_rate': '4.999e-05', 'epoch': '0.1426', 'num_input_tokens_seen': 11590114, 'train_runtime': '5863', 'train_tokens_per_second': '1977'} +{'loss': '0.3731', 'grad_norm': '0.9977', 'learning_rate': '4.999e-05', 'epoch': '0.1426', 'num_input_tokens_seen': 11592161, 'train_runtime': '5864', 'train_tokens_per_second': '1977'} +{'loss': '1.764', 'grad_norm': '2.229', 'learning_rate': '4.999e-05', 'epoch': '0.1426', 'num_input_tokens_seen': 11594208, 'train_runtime': '5865', 'train_tokens_per_second': '1977'} +{'loss': '0.8206', 'grad_norm': '1.643', 'learning_rate': '4.999e-05', 'epoch': '0.1426', 'num_input_tokens_seen': 11596255, 'train_runtime': '5866', 'train_tokens_per_second': '1977'} +{'loss': '0.4653', 'grad_norm': '1.548', 'learning_rate': '4.999e-05', 'epoch': '0.1427', 'num_input_tokens_seen': 11598302, 'train_runtime': '5867', 'train_tokens_per_second': '1977'} +{'loss': '0.9794', 'grad_norm': '1.637', 'learning_rate': '4.999e-05', 'epoch': '0.1427', 'num_input_tokens_seen': 11600349, 'train_runtime': '5868', 'train_tokens_per_second': '1977'} +{'loss': '0.9714', 'grad_norm': '1.915', 'learning_rate': '4.999e-05', 'epoch': '0.1427', 'num_input_tokens_seen': 11602396, 'train_runtime': '5869', 'train_tokens_per_second': '1977'} +{'loss': '0.3328', 'grad_norm': '1.243', 'learning_rate': '4.999e-05', 'epoch': '0.1427', 'num_input_tokens_seen': 11604443, 'train_runtime': '5870', 'train_tokens_per_second': '1977'} +{'loss': '0.464', 'grad_norm': '1.246', 'learning_rate': '4.999e-05', 'epoch': '0.1428', 'num_input_tokens_seen': 11606490, 'train_runtime': '5871', 'train_tokens_per_second': '1977'} +{'loss': '0.8294', 'grad_norm': '1.795', 'learning_rate': '4.999e-05', 'epoch': '0.1428', 'num_input_tokens_seen': 11608537, 'train_runtime': '5872', 'train_tokens_per_second': '1977'} +{'loss': '0.4745', 'grad_norm': '1.218', 'learning_rate': '4.999e-05', 'epoch': '0.1428', 'num_input_tokens_seen': 11610584, 'train_runtime': '5873', 'train_tokens_per_second': '1977'} +{'loss': '1.147', 'grad_norm': '1.719', 'learning_rate': '4.999e-05', 'epoch': '0.1428', 'num_input_tokens_seen': 11612631, 'train_runtime': '5874', 'train_tokens_per_second': '1977'} +{'loss': '0.4249', 'grad_norm': '1.36', 'learning_rate': '4.999e-05', 'epoch': '0.1429', 'num_input_tokens_seen': 11614678, 'train_runtime': '5875', 'train_tokens_per_second': '1977'} +{'loss': '0.4992', 'grad_norm': '1.52', 'learning_rate': '4.999e-05', 'epoch': '0.1429', 'num_input_tokens_seen': 11616725, 'train_runtime': '5876', 'train_tokens_per_second': '1977'} +{'loss': '0.5699', 'grad_norm': '1.316', 'learning_rate': '4.999e-05', 'epoch': '0.1429', 'num_input_tokens_seen': 11618772, 'train_runtime': '5877', 'train_tokens_per_second': '1977'} +{'loss': '1.037', 'grad_norm': '1.488', 'learning_rate': '4.999e-05', 'epoch': '0.1429', 'num_input_tokens_seen': 11620819, 'train_runtime': '5878', 'train_tokens_per_second': '1977'} +{'loss': '0.3353', 'grad_norm': '1.153', 'learning_rate': '4.999e-05', 'epoch': '0.143', 'num_input_tokens_seen': 11622866, 'train_runtime': '5879', 'train_tokens_per_second': '1977'} +{'loss': '0.848', 'grad_norm': '1.418', 'learning_rate': '4.999e-05', 'epoch': '0.143', 'num_input_tokens_seen': 11624913, 'train_runtime': '5880', 'train_tokens_per_second': '1977'} +{'loss': '0.6506', 'grad_norm': '1.512', 'learning_rate': '4.999e-05', 'epoch': '0.143', 'num_input_tokens_seen': 11626960, 'train_runtime': '5881', 'train_tokens_per_second': '1977'} +{'loss': '0.7914', 'grad_norm': '1.163', 'learning_rate': '4.999e-05', 'epoch': '0.143', 'num_input_tokens_seen': 11629007, 'train_runtime': '5882', 'train_tokens_per_second': '1977'} +{'loss': '0.8195', 'grad_norm': '1.687', 'learning_rate': '4.999e-05', 'epoch': '0.1431', 'num_input_tokens_seen': 11631054, 'train_runtime': '5884', 'train_tokens_per_second': '1977'} +{'loss': '0.3558', 'grad_norm': '1.145', 'learning_rate': '4.999e-05', 'epoch': '0.1431', 'num_input_tokens_seen': 11633101, 'train_runtime': '5885', 'train_tokens_per_second': '1977'} +{'loss': '0.7054', 'grad_norm': '1.478', 'learning_rate': '4.999e-05', 'epoch': '0.1431', 'num_input_tokens_seen': 11635148, 'train_runtime': '5886', 'train_tokens_per_second': '1977'} +{'loss': '0.3512', 'grad_norm': '1.027', 'learning_rate': '4.999e-05', 'epoch': '0.1431', 'num_input_tokens_seen': 11637195, 'train_runtime': '5887', 'train_tokens_per_second': '1977'} +{'loss': '0.3725', 'grad_norm': '0.9464', 'learning_rate': '4.999e-05', 'epoch': '0.1432', 'num_input_tokens_seen': 11639242, 'train_runtime': '5888', 'train_tokens_per_second': '1977'} +{'loss': '1.419', 'grad_norm': '2.471', 'learning_rate': '4.999e-05', 'epoch': '0.1432', 'num_input_tokens_seen': 11641289, 'train_runtime': '5889', 'train_tokens_per_second': '1977'} +{'loss': '0.6365', 'grad_norm': '1.548', 'learning_rate': '4.999e-05', 'epoch': '0.1432', 'num_input_tokens_seen': 11643336, 'train_runtime': '5890', 'train_tokens_per_second': '1977'} +{'loss': '0.4674', 'grad_norm': '1.254', 'learning_rate': '4.999e-05', 'epoch': '0.1432', 'num_input_tokens_seen': 11645383, 'train_runtime': '5891', 'train_tokens_per_second': '1977'} +{'loss': '0.7651', 'grad_norm': '1.664', 'learning_rate': '4.999e-05', 'epoch': '0.1433', 'num_input_tokens_seen': 11647430, 'train_runtime': '5892', 'train_tokens_per_second': '1977'} +{'loss': '0.9294', 'grad_norm': '1.461', 'learning_rate': '4.999e-05', 'epoch': '0.1433', 'num_input_tokens_seen': 11649477, 'train_runtime': '5893', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '2.242', 'learning_rate': '4.999e-05', 'epoch': '0.1433', 'num_input_tokens_seen': 11651524, 'train_runtime': '5894', 'train_tokens_per_second': '1977'} +{'loss': '1.541', 'grad_norm': '2.462', 'learning_rate': '4.999e-05', 'epoch': '0.1433', 'num_input_tokens_seen': 11653571, 'train_runtime': '5895', 'train_tokens_per_second': '1977'} +{'loss': '0.2915', 'grad_norm': '1.172', 'learning_rate': '4.999e-05', 'epoch': '0.1434', 'num_input_tokens_seen': 11655618, 'train_runtime': '5896', 'train_tokens_per_second': '1977'} +{'loss': '0.29', 'grad_norm': '0.98', 'learning_rate': '4.999e-05', 'epoch': '0.1434', 'num_input_tokens_seen': 11657665, 'train_runtime': '5897', 'train_tokens_per_second': '1977'} +{'loss': '0.3712', 'grad_norm': '0.9656', 'learning_rate': '4.999e-05', 'epoch': '0.1434', 'num_input_tokens_seen': 11659712, 'train_runtime': '5898', 'train_tokens_per_second': '1977'} +{'loss': '0.8543', 'grad_norm': '1.529', 'learning_rate': '4.999e-05', 'epoch': '0.1434', 'num_input_tokens_seen': 11661759, 'train_runtime': '5899', 'train_tokens_per_second': '1977'} +{'loss': '0.8873', 'grad_norm': '1.639', 'learning_rate': '4.999e-05', 'epoch': '0.1435', 'num_input_tokens_seen': 11663806, 'train_runtime': '5900', 'train_tokens_per_second': '1977'} +{'loss': '1.829', 'grad_norm': '2.53', 'learning_rate': '4.999e-05', 'epoch': '0.1435', 'num_input_tokens_seen': 11665853, 'train_runtime': '5901', 'train_tokens_per_second': '1977'} +{'loss': '1.416', 'grad_norm': '2.322', 'learning_rate': '4.999e-05', 'epoch': '0.1435', 'num_input_tokens_seen': 11667900, 'train_runtime': '5902', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '2.075', 'learning_rate': '4.999e-05', 'epoch': '0.1435', 'num_input_tokens_seen': 11669947, 'train_runtime': '5903', 'train_tokens_per_second': '1977'} +{'loss': '1.485', 'grad_norm': '2.339', 'learning_rate': '4.999e-05', 'epoch': '0.1436', 'num_input_tokens_seen': 11671994, 'train_runtime': '5904', 'train_tokens_per_second': '1977'} +{'loss': '0.8513', 'grad_norm': '1.295', 'learning_rate': '4.999e-05', 'epoch': '0.1436', 'num_input_tokens_seen': 11674041, 'train_runtime': '5905', 'train_tokens_per_second': '1977'} +{'loss': '0.3642', 'grad_norm': '0.9911', 'learning_rate': '4.999e-05', 'epoch': '0.1436', 'num_input_tokens_seen': 11676088, 'train_runtime': '5906', 'train_tokens_per_second': '1977'} +{'loss': '1.111', 'grad_norm': '1.515', 'learning_rate': '4.999e-05', 'epoch': '0.1436', 'num_input_tokens_seen': 11678135, 'train_runtime': '5907', 'train_tokens_per_second': '1977'} +{'loss': '0.3899', 'grad_norm': '1.153', 'learning_rate': '4.999e-05', 'epoch': '0.1437', 'num_input_tokens_seen': 11680182, 'train_runtime': '5908', 'train_tokens_per_second': '1977'} +{'loss': '1.652', 'grad_norm': '2.247', 'learning_rate': '4.999e-05', 'epoch': '0.1437', 'num_input_tokens_seen': 11682229, 'train_runtime': '5909', 'train_tokens_per_second': '1977'} +{'loss': '0.939', 'grad_norm': '1.338', 'learning_rate': '4.999e-05', 'epoch': '0.1437', 'num_input_tokens_seen': 11684276, 'train_runtime': '5910', 'train_tokens_per_second': '1977'} +{'loss': '0.8974', 'grad_norm': '1.566', 'learning_rate': '4.999e-05', 'epoch': '0.1437', 'num_input_tokens_seen': 11686323, 'train_runtime': '5911', 'train_tokens_per_second': '1977'} +{'loss': '0.6061', 'grad_norm': '1.309', 'learning_rate': '4.999e-05', 'epoch': '0.1438', 'num_input_tokens_seen': 11688370, 'train_runtime': '5912', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '2.259', 'learning_rate': '4.999e-05', 'epoch': '0.1438', 'num_input_tokens_seen': 11690417, 'train_runtime': '5914', 'train_tokens_per_second': '1977'} +{'loss': '0.7721', 'grad_norm': '1.575', 'learning_rate': '4.999e-05', 'epoch': '0.1438', 'num_input_tokens_seen': 11692464, 'train_runtime': '5915', 'train_tokens_per_second': '1977'} +{'loss': '0.8021', 'grad_norm': '1.514', 'learning_rate': '4.999e-05', 'epoch': '0.1438', 'num_input_tokens_seen': 11694511, 'train_runtime': '5916', 'train_tokens_per_second': '1977'} +{'loss': '1.59', 'grad_norm': '2.208', 'learning_rate': '4.999e-05', 'epoch': '0.1439', 'num_input_tokens_seen': 11696558, 'train_runtime': '5917', 'train_tokens_per_second': '1977'} +{'loss': '0.5588', 'grad_norm': '1.555', 'learning_rate': '4.999e-05', 'epoch': '0.1439', 'num_input_tokens_seen': 11698605, 'train_runtime': '5918', 'train_tokens_per_second': '1977'} +{'loss': '0.753', 'grad_norm': '1.439', 'learning_rate': '4.999e-05', 'epoch': '0.1439', 'num_input_tokens_seen': 11700652, 'train_runtime': '5919', 'train_tokens_per_second': '1977'} +{'loss': '0.5358', 'grad_norm': '1.206', 'learning_rate': '4.999e-05', 'epoch': '0.1439', 'num_input_tokens_seen': 11702699, 'train_runtime': '5920', 'train_tokens_per_second': '1977'} +{'loss': '1.294', 'grad_norm': '1.92', 'learning_rate': '4.999e-05', 'epoch': '0.144', 'num_input_tokens_seen': 11704746, 'train_runtime': '5921', 'train_tokens_per_second': '1977'} +{'loss': '0.3944', 'grad_norm': '1.42', 'learning_rate': '4.999e-05', 'epoch': '0.144', 'num_input_tokens_seen': 11706793, 'train_runtime': '5922', 'train_tokens_per_second': '1977'} +{'loss': '0.8244', 'grad_norm': '1.334', 'learning_rate': '4.999e-05', 'epoch': '0.144', 'num_input_tokens_seen': 11708840, 'train_runtime': '5923', 'train_tokens_per_second': '1977'} +{'loss': '1.328', 'grad_norm': '2.28', 'learning_rate': '4.999e-05', 'epoch': '0.144', 'num_input_tokens_seen': 11710887, 'train_runtime': '5924', 'train_tokens_per_second': '1977'} +{'loss': '0.3768', 'grad_norm': '1.296', 'learning_rate': '4.999e-05', 'epoch': '0.1441', 'num_input_tokens_seen': 11712934, 'train_runtime': '5925', 'train_tokens_per_second': '1977'} +{'loss': '0.4385', 'grad_norm': '1.123', 'learning_rate': '4.999e-05', 'epoch': '0.1441', 'num_input_tokens_seen': 11714981, 'train_runtime': '5926', 'train_tokens_per_second': '1977'} +{'loss': '0.6982', 'grad_norm': '1.406', 'learning_rate': '4.999e-05', 'epoch': '0.1441', 'num_input_tokens_seen': 11717028, 'train_runtime': '5927', 'train_tokens_per_second': '1977'} +{'loss': '1.097', 'grad_norm': '2.118', 'learning_rate': '4.999e-05', 'epoch': '0.1441', 'num_input_tokens_seen': 11719075, 'train_runtime': '5928', 'train_tokens_per_second': '1977'} +{'loss': '0.9337', 'grad_norm': '1.564', 'learning_rate': '4.999e-05', 'epoch': '0.1442', 'num_input_tokens_seen': 11721122, 'train_runtime': '5929', 'train_tokens_per_second': '1977'} +{'loss': '0.85', 'grad_norm': '1.549', 'learning_rate': '4.999e-05', 'epoch': '0.1442', 'num_input_tokens_seen': 11723169, 'train_runtime': '5930', 'train_tokens_per_second': '1977'} +{'loss': '0.3482', 'grad_norm': '1.307', 'learning_rate': '4.999e-05', 'epoch': '0.1442', 'num_input_tokens_seen': 11725216, 'train_runtime': '5931', 'train_tokens_per_second': '1977'} +{'loss': '0.4527', 'grad_norm': '1.187', 'learning_rate': '4.999e-05', 'epoch': '0.1442', 'num_input_tokens_seen': 11727263, 'train_runtime': '5932', 'train_tokens_per_second': '1977'} +{'loss': '0.2843', 'grad_norm': '1.076', 'learning_rate': '4.999e-05', 'epoch': '0.1443', 'num_input_tokens_seen': 11729310, 'train_runtime': '5933', 'train_tokens_per_second': '1977'} +{'loss': '0.4059', 'grad_norm': '1.341', 'learning_rate': '4.999e-05', 'epoch': '0.1443', 'num_input_tokens_seen': 11731357, 'train_runtime': '5934', 'train_tokens_per_second': '1977'} +{'loss': '0.8917', 'grad_norm': '1.241', 'learning_rate': '4.999e-05', 'epoch': '0.1443', 'num_input_tokens_seen': 11733404, 'train_runtime': '5935', 'train_tokens_per_second': '1977'} +{'loss': '0.5019', 'grad_norm': '1.224', 'learning_rate': '4.999e-05', 'epoch': '0.1443', 'num_input_tokens_seen': 11735451, 'train_runtime': '5936', 'train_tokens_per_second': '1977'} +{'loss': '0.9761', 'grad_norm': '1.786', 'learning_rate': '4.999e-05', 'epoch': '0.1444', 'num_input_tokens_seen': 11737498, 'train_runtime': '5937', 'train_tokens_per_second': '1977'} +{'loss': '0.6021', 'grad_norm': '1.501', 'learning_rate': '4.999e-05', 'epoch': '0.1444', 'num_input_tokens_seen': 11739545, 'train_runtime': '5938', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '2.152', 'learning_rate': '4.999e-05', 'epoch': '0.1444', 'num_input_tokens_seen': 11741592, 'train_runtime': '5939', 'train_tokens_per_second': '1977'} +{'loss': '0.554', 'grad_norm': '1.357', 'learning_rate': '4.999e-05', 'epoch': '0.1444', 'num_input_tokens_seen': 11743639, 'train_runtime': '5940', 'train_tokens_per_second': '1977'} +{'loss': '1.382', 'grad_norm': '2.332', 'learning_rate': '4.999e-05', 'epoch': '0.1445', 'num_input_tokens_seen': 11745686, 'train_runtime': '5941', 'train_tokens_per_second': '1977'} +{'loss': '1.618', 'grad_norm': '2.191', 'learning_rate': '4.999e-05', 'epoch': '0.1445', 'num_input_tokens_seen': 11747733, 'train_runtime': '5942', 'train_tokens_per_second': '1977'} +{'loss': '0.6571', 'grad_norm': '1.481', 'learning_rate': '4.999e-05', 'epoch': '0.1445', 'num_input_tokens_seen': 11749780, 'train_runtime': '5944', 'train_tokens_per_second': '1977'} +{'loss': '0.4114', 'grad_norm': '1.132', 'learning_rate': '4.999e-05', 'epoch': '0.1445', 'num_input_tokens_seen': 11751827, 'train_runtime': '5945', 'train_tokens_per_second': '1977'} +{'loss': '0.3865', 'grad_norm': '1.422', 'learning_rate': '4.999e-05', 'epoch': '0.1446', 'num_input_tokens_seen': 11753874, 'train_runtime': '5946', 'train_tokens_per_second': '1977'} +{'loss': '1.977', 'grad_norm': '2.506', 'learning_rate': '4.999e-05', 'epoch': '0.1446', 'num_input_tokens_seen': 11755921, 'train_runtime': '5947', 'train_tokens_per_second': '1977'} +{'loss': '1.908', 'grad_norm': '2.244', 'learning_rate': '4.999e-05', 'epoch': '0.1446', 'num_input_tokens_seen': 11757968, 'train_runtime': '5948', 'train_tokens_per_second': '1977'} +{'loss': '0.449', 'grad_norm': '1.342', 'learning_rate': '4.999e-05', 'epoch': '0.1446', 'num_input_tokens_seen': 11760015, 'train_runtime': '5949', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.708', 'learning_rate': '4.999e-05', 'epoch': '0.1447', 'num_input_tokens_seen': 11762062, 'train_runtime': '5950', 'train_tokens_per_second': '1977'} +{'loss': '1.14', 'grad_norm': '1.92', 'learning_rate': '4.999e-05', 'epoch': '0.1447', 'num_input_tokens_seen': 11764109, 'train_runtime': '5951', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '2.085', 'learning_rate': '4.999e-05', 'epoch': '0.1447', 'num_input_tokens_seen': 11766156, 'train_runtime': '5952', 'train_tokens_per_second': '1977'} +{'loss': '0.4112', 'grad_norm': '1.072', 'learning_rate': '4.999e-05', 'epoch': '0.1447', 'num_input_tokens_seen': 11768203, 'train_runtime': '5953', 'train_tokens_per_second': '1977'} +{'loss': '0.3132', 'grad_norm': '1.169', 'learning_rate': '4.999e-05', 'epoch': '0.1448', 'num_input_tokens_seen': 11770250, 'train_runtime': '5954', 'train_tokens_per_second': '1977'} +{'loss': '0.9367', 'grad_norm': '1.493', 'learning_rate': '4.999e-05', 'epoch': '0.1448', 'num_input_tokens_seen': 11772297, 'train_runtime': '5955', 'train_tokens_per_second': '1977'} +{'loss': '1.1', 'grad_norm': '1.545', 'learning_rate': '4.999e-05', 'epoch': '0.1448', 'num_input_tokens_seen': 11774344, 'train_runtime': '5956', 'train_tokens_per_second': '1977'} +{'loss': '0.8105', 'grad_norm': '1.415', 'learning_rate': '4.999e-05', 'epoch': '0.1448', 'num_input_tokens_seen': 11776391, 'train_runtime': '5957', 'train_tokens_per_second': '1977'} +{'loss': '0.7521', 'grad_norm': '1.172', 'learning_rate': '4.999e-05', 'epoch': '0.1449', 'num_input_tokens_seen': 11778438, 'train_runtime': '5958', 'train_tokens_per_second': '1977'} +{'loss': '0.6955', 'grad_norm': '1.789', 'learning_rate': '4.999e-05', 'epoch': '0.1449', 'num_input_tokens_seen': 11780485, 'train_runtime': '5959', 'train_tokens_per_second': '1977'} +{'loss': '0.6977', 'grad_norm': '1.353', 'learning_rate': '4.999e-05', 'epoch': '0.1449', 'num_input_tokens_seen': 11782532, 'train_runtime': '5960', 'train_tokens_per_second': '1977'} +{'loss': '1.475', 'grad_norm': '2.617', 'learning_rate': '4.999e-05', 'epoch': '0.145', 'num_input_tokens_seen': 11784579, 'train_runtime': '5961', 'train_tokens_per_second': '1977'} +{'loss': '2.179', 'grad_norm': '2.424', 'learning_rate': '4.999e-05', 'epoch': '0.145', 'num_input_tokens_seen': 11786626, 'train_runtime': '5962', 'train_tokens_per_second': '1977'} +{'loss': '0.37', 'grad_norm': '1.099', 'learning_rate': '4.999e-05', 'epoch': '0.145', 'num_input_tokens_seen': 11788673, 'train_runtime': '5963', 'train_tokens_per_second': '1977'} +{'loss': '2.119', 'grad_norm': '2.992', 'learning_rate': '4.999e-05', 'epoch': '0.145', 'num_input_tokens_seen': 11790720, 'train_runtime': '5964', 'train_tokens_per_second': '1977'} +{'loss': '0.78', 'grad_norm': '1.637', 'learning_rate': '4.999e-05', 'epoch': '0.1451', 'num_input_tokens_seen': 11792767, 'train_runtime': '5965', 'train_tokens_per_second': '1977'} +{'loss': '0.6369', 'grad_norm': '1.588', 'learning_rate': '4.999e-05', 'epoch': '0.1451', 'num_input_tokens_seen': 11794814, 'train_runtime': '5966', 'train_tokens_per_second': '1977'} +{'loss': '0.4828', 'grad_norm': '1.102', 'learning_rate': '4.999e-05', 'epoch': '0.1451', 'num_input_tokens_seen': 11796861, 'train_runtime': '5967', 'train_tokens_per_second': '1977'} +{'loss': '0.6493', 'grad_norm': '1.463', 'learning_rate': '4.999e-05', 'epoch': '0.1451', 'num_input_tokens_seen': 11798908, 'train_runtime': '5968', 'train_tokens_per_second': '1977'} +{'loss': '0.8492', 'grad_norm': '1.74', 'learning_rate': '4.999e-05', 'epoch': '0.1452', 'num_input_tokens_seen': 11800955, 'train_runtime': '5969', 'train_tokens_per_second': '1977'} +{'loss': '2.389', 'grad_norm': '2.499', 'learning_rate': '4.999e-05', 'epoch': '0.1452', 'num_input_tokens_seen': 11803002, 'train_runtime': '5970', 'train_tokens_per_second': '1977'} +{'loss': '1.768', 'grad_norm': '2.956', 'learning_rate': '4.999e-05', 'epoch': '0.1452', 'num_input_tokens_seen': 11805049, 'train_runtime': '5971', 'train_tokens_per_second': '1977'} +{'loss': '2.011', 'grad_norm': '2.31', 'learning_rate': '4.999e-05', 'epoch': '0.1452', 'num_input_tokens_seen': 11807096, 'train_runtime': '5972', 'train_tokens_per_second': '1977'} +{'loss': '0.3457', 'grad_norm': '1.113', 'learning_rate': '4.999e-05', 'epoch': '0.1453', 'num_input_tokens_seen': 11809143, 'train_runtime': '5974', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '1.552', 'learning_rate': '4.999e-05', 'epoch': '0.1453', 'num_input_tokens_seen': 11811190, 'train_runtime': '5975', 'train_tokens_per_second': '1977'} +{'loss': '0.7556', 'grad_norm': '1.498', 'learning_rate': '4.999e-05', 'epoch': '0.1453', 'num_input_tokens_seen': 11813237, 'train_runtime': '5976', 'train_tokens_per_second': '1977'} +{'loss': '0.5506', 'grad_norm': '1.268', 'learning_rate': '4.999e-05', 'epoch': '0.1453', 'num_input_tokens_seen': 11815284, 'train_runtime': '5977', 'train_tokens_per_second': '1977'} +{'loss': '1.319', 'grad_norm': '2.498', 'learning_rate': '4.999e-05', 'epoch': '0.1454', 'num_input_tokens_seen': 11817331, 'train_runtime': '5978', 'train_tokens_per_second': '1977'} +{'loss': '0.3872', 'grad_norm': '0.9859', 'learning_rate': '4.999e-05', 'epoch': '0.1454', 'num_input_tokens_seen': 11819378, 'train_runtime': '5979', 'train_tokens_per_second': '1977'} +{'loss': '0.4423', 'grad_norm': '1.206', 'learning_rate': '4.999e-05', 'epoch': '0.1454', 'num_input_tokens_seen': 11821425, 'train_runtime': '5980', 'train_tokens_per_second': '1977'} +{'loss': '1.138', 'grad_norm': '1.699', 'learning_rate': '4.999e-05', 'epoch': '0.1454', 'num_input_tokens_seen': 11823472, 'train_runtime': '5981', 'train_tokens_per_second': '1977'} +{'loss': '0.9262', 'grad_norm': '1.525', 'learning_rate': '4.999e-05', 'epoch': '0.1455', 'num_input_tokens_seen': 11825519, 'train_runtime': '5982', 'train_tokens_per_second': '1977'} +{'loss': '0.6256', 'grad_norm': '1.373', 'learning_rate': '4.999e-05', 'epoch': '0.1455', 'num_input_tokens_seen': 11827566, 'train_runtime': '5983', 'train_tokens_per_second': '1977'} +{'loss': '0.4803', 'grad_norm': '1.179', 'learning_rate': '4.999e-05', 'epoch': '0.1455', 'num_input_tokens_seen': 11829613, 'train_runtime': '5984', 'train_tokens_per_second': '1977'} +{'loss': '1.26', 'grad_norm': '2.091', 'learning_rate': '4.999e-05', 'epoch': '0.1455', 'num_input_tokens_seen': 11831660, 'train_runtime': '5985', 'train_tokens_per_second': '1977'} +{'loss': '0.8754', 'grad_norm': '1.773', 'learning_rate': '4.999e-05', 'epoch': '0.1456', 'num_input_tokens_seen': 11833707, 'train_runtime': '5986', 'train_tokens_per_second': '1977'} +{'loss': '1.35', 'grad_norm': '2.009', 'learning_rate': '4.999e-05', 'epoch': '0.1456', 'num_input_tokens_seen': 11835754, 'train_runtime': '5987', 'train_tokens_per_second': '1977'} +{'loss': '0.8833', 'grad_norm': '1.279', 'learning_rate': '4.999e-05', 'epoch': '0.1456', 'num_input_tokens_seen': 11837801, 'train_runtime': '5988', 'train_tokens_per_second': '1977'} +{'loss': '2.808', 'grad_norm': '2.374', 'learning_rate': '4.999e-05', 'epoch': '0.1456', 'num_input_tokens_seen': 11839848, 'train_runtime': '5989', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.461', 'learning_rate': '4.999e-05', 'epoch': '0.1457', 'num_input_tokens_seen': 11841895, 'train_runtime': '5990', 'train_tokens_per_second': '1977'} +{'loss': '0.9949', 'grad_norm': '1.682', 'learning_rate': '4.999e-05', 'epoch': '0.1457', 'num_input_tokens_seen': 11843942, 'train_runtime': '5991', 'train_tokens_per_second': '1977'} +{'loss': '0.8337', 'grad_norm': '1.415', 'learning_rate': '4.999e-05', 'epoch': '0.1457', 'num_input_tokens_seen': 11845989, 'train_runtime': '5992', 'train_tokens_per_second': '1977'} +{'loss': '0.4591', 'grad_norm': '1.044', 'learning_rate': '4.999e-05', 'epoch': '0.1457', 'num_input_tokens_seen': 11848036, 'train_runtime': '5993', 'train_tokens_per_second': '1977'} +{'loss': '0.6552', 'grad_norm': '1.457', 'learning_rate': '4.999e-05', 'epoch': '0.1458', 'num_input_tokens_seen': 11850083, 'train_runtime': '5994', 'train_tokens_per_second': '1977'} +{'loss': '0.7365', 'grad_norm': '1.453', 'learning_rate': '4.999e-05', 'epoch': '0.1458', 'num_input_tokens_seen': 11852130, 'train_runtime': '5995', 'train_tokens_per_second': '1977'} +{'loss': '0.3905', 'grad_norm': '0.9452', 'learning_rate': '4.999e-05', 'epoch': '0.1458', 'num_input_tokens_seen': 11854177, 'train_runtime': '5996', 'train_tokens_per_second': '1977'} +{'loss': '1.118', 'grad_norm': '1.722', 'learning_rate': '4.999e-05', 'epoch': '0.1458', 'num_input_tokens_seen': 11856224, 'train_runtime': '5997', 'train_tokens_per_second': '1977'} +{'loss': '1.136', 'grad_norm': '1.963', 'learning_rate': '4.999e-05', 'epoch': '0.1459', 'num_input_tokens_seen': 11858271, 'train_runtime': '5998', 'train_tokens_per_second': '1977'} +{'loss': '0.3988', 'grad_norm': '1.156', 'learning_rate': '4.999e-05', 'epoch': '0.1459', 'num_input_tokens_seen': 11860318, 'train_runtime': '5999', 'train_tokens_per_second': '1977'} +{'loss': '0.8474', 'grad_norm': '1.533', 'learning_rate': '4.999e-05', 'epoch': '0.1459', 'num_input_tokens_seen': 11862365, 'train_runtime': '6000', 'train_tokens_per_second': '1977'} +{'loss': '0.5126', 'grad_norm': '1.326', 'learning_rate': '4.999e-05', 'epoch': '0.1459', 'num_input_tokens_seen': 11864412, 'train_runtime': '6001', 'train_tokens_per_second': '1977'} +{'loss': '0.6037', 'grad_norm': '1.236', 'learning_rate': '4.999e-05', 'epoch': '0.146', 'num_input_tokens_seen': 11866459, 'train_runtime': '6003', 'train_tokens_per_second': '1977'} +{'loss': '0.4021', 'grad_norm': '0.9425', 'learning_rate': '4.999e-05', 'epoch': '0.146', 'num_input_tokens_seen': 11868506, 'train_runtime': '6004', 'train_tokens_per_second': '1977'} +{'loss': '0.981', 'grad_norm': '1.436', 'learning_rate': '4.999e-05', 'epoch': '0.146', 'num_input_tokens_seen': 11870553, 'train_runtime': '6005', 'train_tokens_per_second': '1977'} +{'loss': '2.114', 'grad_norm': '2.632', 'learning_rate': '4.999e-05', 'epoch': '0.146', 'num_input_tokens_seen': 11872600, 'train_runtime': '6006', 'train_tokens_per_second': '1977'} +{'loss': '0.8396', 'grad_norm': '1.513', 'learning_rate': '4.999e-05', 'epoch': '0.1461', 'num_input_tokens_seen': 11874647, 'train_runtime': '6007', 'train_tokens_per_second': '1977'} +{'loss': '1.904', 'grad_norm': '2.593', 'learning_rate': '4.999e-05', 'epoch': '0.1461', 'num_input_tokens_seen': 11876694, 'train_runtime': '6008', 'train_tokens_per_second': '1977'} +{'loss': '1.501', 'grad_norm': '1.955', 'learning_rate': '4.999e-05', 'epoch': '0.1461', 'num_input_tokens_seen': 11878741, 'train_runtime': '6009', 'train_tokens_per_second': '1977'} +{'loss': '0.3283', 'grad_norm': '1.034', 'learning_rate': '4.999e-05', 'epoch': '0.1461', 'num_input_tokens_seen': 11880788, 'train_runtime': '6010', 'train_tokens_per_second': '1977'} +{'loss': '0.4999', 'grad_norm': '1.117', 'learning_rate': '4.999e-05', 'epoch': '0.1462', 'num_input_tokens_seen': 11882835, 'train_runtime': '6011', 'train_tokens_per_second': '1977'} +{'loss': '1.245', 'grad_norm': '2.205', 'learning_rate': '4.999e-05', 'epoch': '0.1462', 'num_input_tokens_seen': 11884882, 'train_runtime': '6012', 'train_tokens_per_second': '1977'} +{'loss': '0.7397', 'grad_norm': '1.752', 'learning_rate': '4.999e-05', 'epoch': '0.1462', 'num_input_tokens_seen': 11886929, 'train_runtime': '6013', 'train_tokens_per_second': '1977'} +{'loss': '0.4813', 'grad_norm': '1.29', 'learning_rate': '4.999e-05', 'epoch': '0.1462', 'num_input_tokens_seen': 11888976, 'train_runtime': '6014', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '1.646', 'learning_rate': '4.999e-05', 'epoch': '0.1463', 'num_input_tokens_seen': 11891023, 'train_runtime': '6015', 'train_tokens_per_second': '1977'} +{'loss': '0.7535', 'grad_norm': '1.711', 'learning_rate': '4.999e-05', 'epoch': '0.1463', 'num_input_tokens_seen': 11893070, 'train_runtime': '6016', 'train_tokens_per_second': '1977'} +{'loss': '0.3126', 'grad_norm': '1.145', 'learning_rate': '4.999e-05', 'epoch': '0.1463', 'num_input_tokens_seen': 11895117, 'train_runtime': '6017', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '2.175', 'learning_rate': '4.999e-05', 'epoch': '0.1463', 'num_input_tokens_seen': 11897164, 'train_runtime': '6018', 'train_tokens_per_second': '1977'} +{'loss': '0.5489', 'grad_norm': '1.232', 'learning_rate': '4.999e-05', 'epoch': '0.1464', 'num_input_tokens_seen': 11899211, 'train_runtime': '6019', 'train_tokens_per_second': '1977'} +{'loss': '1.761', 'grad_norm': '2.396', 'learning_rate': '4.999e-05', 'epoch': '0.1464', 'num_input_tokens_seen': 11901258, 'train_runtime': '6020', 'train_tokens_per_second': '1977'} +{'loss': '0.8611', 'grad_norm': '1.629', 'learning_rate': '4.999e-05', 'epoch': '0.1464', 'num_input_tokens_seen': 11903305, 'train_runtime': '6021', 'train_tokens_per_second': '1977'} +{'loss': '1.127', 'grad_norm': '1.811', 'learning_rate': '4.999e-05', 'epoch': '0.1464', 'num_input_tokens_seen': 11905352, 'train_runtime': '6022', 'train_tokens_per_second': '1977'} +{'loss': '0.3055', 'grad_norm': '1.112', 'learning_rate': '4.999e-05', 'epoch': '0.1465', 'num_input_tokens_seen': 11907399, 'train_runtime': '6023', 'train_tokens_per_second': '1977'} +{'loss': '0.8763', 'grad_norm': '1.319', 'learning_rate': '4.999e-05', 'epoch': '0.1465', 'num_input_tokens_seen': 11909446, 'train_runtime': '6024', 'train_tokens_per_second': '1977'} +{'loss': '0.6047', 'grad_norm': '1.598', 'learning_rate': '4.999e-05', 'epoch': '0.1465', 'num_input_tokens_seen': 11911493, 'train_runtime': '6025', 'train_tokens_per_second': '1977'} +{'loss': '0.3958', 'grad_norm': '1.481', 'learning_rate': '4.999e-05', 'epoch': '0.1465', 'num_input_tokens_seen': 11913540, 'train_runtime': '6026', 'train_tokens_per_second': '1977'} +{'loss': '0.573', 'grad_norm': '1.318', 'learning_rate': '4.999e-05', 'epoch': '0.1466', 'num_input_tokens_seen': 11915587, 'train_runtime': '6027', 'train_tokens_per_second': '1977'} +{'loss': '0.6512', 'grad_norm': '1.525', 'learning_rate': '4.999e-05', 'epoch': '0.1466', 'num_input_tokens_seen': 11917634, 'train_runtime': '6028', 'train_tokens_per_second': '1977'} +{'loss': '1.312', 'grad_norm': '1.75', 'learning_rate': '4.999e-05', 'epoch': '0.1466', 'num_input_tokens_seen': 11919681, 'train_runtime': '6029', 'train_tokens_per_second': '1977'} +{'loss': '2.06', 'grad_norm': '2.529', 'learning_rate': '4.999e-05', 'epoch': '0.1466', 'num_input_tokens_seen': 11921728, 'train_runtime': '6030', 'train_tokens_per_second': '1977'} +{'loss': '1.341', 'grad_norm': '2.129', 'learning_rate': '4.999e-05', 'epoch': '0.1467', 'num_input_tokens_seen': 11923775, 'train_runtime': '6031', 'train_tokens_per_second': '1977'} +{'loss': '1.444', 'grad_norm': '2.045', 'learning_rate': '4.999e-05', 'epoch': '0.1467', 'num_input_tokens_seen': 11925822, 'train_runtime': '6033', 'train_tokens_per_second': '1977'} +{'loss': '0.4548', 'grad_norm': '1.213', 'learning_rate': '4.999e-05', 'epoch': '0.1467', 'num_input_tokens_seen': 11927869, 'train_runtime': '6034', 'train_tokens_per_second': '1977'} +{'loss': '0.4674', 'grad_norm': '0.9363', 'learning_rate': '4.999e-05', 'epoch': '0.1467', 'num_input_tokens_seen': 11929916, 'train_runtime': '6035', 'train_tokens_per_second': '1977'} +{'loss': '0.5831', 'grad_norm': '1.456', 'learning_rate': '4.999e-05', 'epoch': '0.1468', 'num_input_tokens_seen': 11931963, 'train_runtime': '6036', 'train_tokens_per_second': '1977'} +{'loss': '1.832', 'grad_norm': '2.34', 'learning_rate': '4.999e-05', 'epoch': '0.1468', 'num_input_tokens_seen': 11934010, 'train_runtime': '6037', 'train_tokens_per_second': '1977'} +{'loss': '0.5787', 'grad_norm': '1.062', 'learning_rate': '4.999e-05', 'epoch': '0.1468', 'num_input_tokens_seen': 11936057, 'train_runtime': '6038', 'train_tokens_per_second': '1977'} +{'loss': '1.736', 'grad_norm': '2.156', 'learning_rate': '4.999e-05', 'epoch': '0.1468', 'num_input_tokens_seen': 11938104, 'train_runtime': '6039', 'train_tokens_per_second': '1977'} +{'loss': '0.7031', 'grad_norm': '1.305', 'learning_rate': '4.999e-05', 'epoch': '0.1469', 'num_input_tokens_seen': 11940151, 'train_runtime': '6040', 'train_tokens_per_second': '1977'} +{'loss': '0.4549', 'grad_norm': '1.016', 'learning_rate': '4.999e-05', 'epoch': '0.1469', 'num_input_tokens_seen': 11942198, 'train_runtime': '6041', 'train_tokens_per_second': '1977'} +{'loss': '0.6704', 'grad_norm': '1.296', 'learning_rate': '4.999e-05', 'epoch': '0.1469', 'num_input_tokens_seen': 11944245, 'train_runtime': '6042', 'train_tokens_per_second': '1977'} +{'loss': '0.953', 'grad_norm': '1.877', 'learning_rate': '4.999e-05', 'epoch': '0.1469', 'num_input_tokens_seen': 11946292, 'train_runtime': '6043', 'train_tokens_per_second': '1977'} +{'loss': '0.3398', 'grad_norm': '1.102', 'learning_rate': '4.999e-05', 'epoch': '0.147', 'num_input_tokens_seen': 11948339, 'train_runtime': '6044', 'train_tokens_per_second': '1977'} +{'loss': '0.2452', 'grad_norm': '0.976', 'learning_rate': '4.999e-05', 'epoch': '0.147', 'num_input_tokens_seen': 11950386, 'train_runtime': '6045', 'train_tokens_per_second': '1977'} +{'loss': '1.26', 'grad_norm': '1.87', 'learning_rate': '4.999e-05', 'epoch': '0.147', 'num_input_tokens_seen': 11952433, 'train_runtime': '6046', 'train_tokens_per_second': '1977'} +{'loss': '0.2628', 'grad_norm': '1.074', 'learning_rate': '4.999e-05', 'epoch': '0.147', 'num_input_tokens_seen': 11954480, 'train_runtime': '6047', 'train_tokens_per_second': '1977'} +{'loss': '0.805', 'grad_norm': '1.647', 'learning_rate': '4.999e-05', 'epoch': '0.1471', 'num_input_tokens_seen': 11956527, 'train_runtime': '6048', 'train_tokens_per_second': '1977'} +{'loss': '0.8883', 'grad_norm': '1.771', 'learning_rate': '4.999e-05', 'epoch': '0.1471', 'num_input_tokens_seen': 11958574, 'train_runtime': '6049', 'train_tokens_per_second': '1977'} +{'loss': '0.8514', 'grad_norm': '1.606', 'learning_rate': '4.999e-05', 'epoch': '0.1471', 'num_input_tokens_seen': 11960621, 'train_runtime': '6050', 'train_tokens_per_second': '1977'} +{'loss': '2.823', 'grad_norm': '2.572', 'learning_rate': '4.999e-05', 'epoch': '0.1471', 'num_input_tokens_seen': 11962668, 'train_runtime': '6051', 'train_tokens_per_second': '1977'} +{'loss': '0.8159', 'grad_norm': '1.434', 'learning_rate': '4.999e-05', 'epoch': '0.1472', 'num_input_tokens_seen': 11964715, 'train_runtime': '6052', 'train_tokens_per_second': '1977'} +{'loss': '0.2755', 'grad_norm': '1.309', 'learning_rate': '4.999e-05', 'epoch': '0.1472', 'num_input_tokens_seen': 11966762, 'train_runtime': '6053', 'train_tokens_per_second': '1977'} +{'loss': '2.468', 'grad_norm': '2.94', 'learning_rate': '4.999e-05', 'epoch': '0.1472', 'num_input_tokens_seen': 11968809, 'train_runtime': '6054', 'train_tokens_per_second': '1977'} +{'loss': '1.871', 'grad_norm': '2.372', 'learning_rate': '4.999e-05', 'epoch': '0.1472', 'num_input_tokens_seen': 11970856, 'train_runtime': '6055', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.762', 'learning_rate': '4.999e-05', 'epoch': '0.1473', 'num_input_tokens_seen': 11972903, 'train_runtime': '6056', 'train_tokens_per_second': '1977'} +{'loss': '2.016', 'grad_norm': '2.444', 'learning_rate': '4.999e-05', 'epoch': '0.1473', 'num_input_tokens_seen': 11974950, 'train_runtime': '6057', 'train_tokens_per_second': '1977'} +{'loss': '0.4612', 'grad_norm': '1.104', 'learning_rate': '4.999e-05', 'epoch': '0.1473', 'num_input_tokens_seen': 11976997, 'train_runtime': '6058', 'train_tokens_per_second': '1977'} +{'loss': '0.5846', 'grad_norm': '1.438', 'learning_rate': '4.999e-05', 'epoch': '0.1473', 'num_input_tokens_seen': 11979044, 'train_runtime': '6059', 'train_tokens_per_second': '1977'} +{'loss': '0.4042', 'grad_norm': '1.3', 'learning_rate': '4.999e-05', 'epoch': '0.1474', 'num_input_tokens_seen': 11981091, 'train_runtime': '6060', 'train_tokens_per_second': '1977'} +{'loss': '0.3729', 'grad_norm': '1.083', 'learning_rate': '4.999e-05', 'epoch': '0.1474', 'num_input_tokens_seen': 11983138, 'train_runtime': '6061', 'train_tokens_per_second': '1977'} +{'loss': '0.3601', 'grad_norm': '1.085', 'learning_rate': '4.999e-05', 'epoch': '0.1474', 'num_input_tokens_seen': 11985185, 'train_runtime': '6063', 'train_tokens_per_second': '1977'} +{'loss': '0.5284', 'grad_norm': '1.138', 'learning_rate': '4.999e-05', 'epoch': '0.1474', 'num_input_tokens_seen': 11987232, 'train_runtime': '6064', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '2.078', 'learning_rate': '4.999e-05', 'epoch': '0.1475', 'num_input_tokens_seen': 11989279, 'train_runtime': '6065', 'train_tokens_per_second': '1977'} +{'loss': '0.818', 'grad_norm': '1.74', 'learning_rate': '4.999e-05', 'epoch': '0.1475', 'num_input_tokens_seen': 11991326, 'train_runtime': '6066', 'train_tokens_per_second': '1977'} +{'loss': '0.633', 'grad_norm': '1.5', 'learning_rate': '4.999e-05', 'epoch': '0.1475', 'num_input_tokens_seen': 11993373, 'train_runtime': '6067', 'train_tokens_per_second': '1977'} +{'loss': '0.3917', 'grad_norm': '0.9109', 'learning_rate': '4.999e-05', 'epoch': '0.1475', 'num_input_tokens_seen': 11995420, 'train_runtime': '6068', 'train_tokens_per_second': '1977'} +{'loss': '0.6536', 'grad_norm': '1.386', 'learning_rate': '4.999e-05', 'epoch': '0.1476', 'num_input_tokens_seen': 11997467, 'train_runtime': '6069', 'train_tokens_per_second': '1977'} +{'loss': '0.9468', 'grad_norm': '1.655', 'learning_rate': '4.999e-05', 'epoch': '0.1476', 'num_input_tokens_seen': 11999514, 'train_runtime': '6070', 'train_tokens_per_second': '1977'} +{'loss': '1.073', 'grad_norm': '1.761', 'learning_rate': '4.999e-05', 'epoch': '0.1476', 'num_input_tokens_seen': 12001561, 'train_runtime': '6071', 'train_tokens_per_second': '1977'} +{'loss': '1.914', 'grad_norm': '2.098', 'learning_rate': '4.999e-05', 'epoch': '0.1476', 'num_input_tokens_seen': 12003608, 'train_runtime': '6072', 'train_tokens_per_second': '1977'} +{'loss': '0.2698', 'grad_norm': '1.017', 'learning_rate': '4.999e-05', 'epoch': '0.1477', 'num_input_tokens_seen': 12005655, 'train_runtime': '6073', 'train_tokens_per_second': '1977'} +{'loss': '0.7863', 'grad_norm': '1.322', 'learning_rate': '4.999e-05', 'epoch': '0.1477', 'num_input_tokens_seen': 12007702, 'train_runtime': '6074', 'train_tokens_per_second': '1977'} +{'loss': '1.833', 'grad_norm': '2.455', 'learning_rate': '4.999e-05', 'epoch': '0.1477', 'num_input_tokens_seen': 12009749, 'train_runtime': '6075', 'train_tokens_per_second': '1977'} +{'loss': '0.3504', 'grad_norm': '1.245', 'learning_rate': '4.999e-05', 'epoch': '0.1477', 'num_input_tokens_seen': 12011796, 'train_runtime': '6076', 'train_tokens_per_second': '1977'} +{'loss': '0.532', 'grad_norm': '1.298', 'learning_rate': '4.999e-05', 'epoch': '0.1478', 'num_input_tokens_seen': 12013843, 'train_runtime': '6077', 'train_tokens_per_second': '1977'} +{'loss': '1.339', 'grad_norm': '1.821', 'learning_rate': '4.999e-05', 'epoch': '0.1478', 'num_input_tokens_seen': 12015890, 'train_runtime': '6078', 'train_tokens_per_second': '1977'} +{'loss': '0.5639', 'grad_norm': '1.3', 'learning_rate': '4.999e-05', 'epoch': '0.1478', 'num_input_tokens_seen': 12017937, 'train_runtime': '6079', 'train_tokens_per_second': '1977'} +{'loss': '2.426', 'grad_norm': '2.602', 'learning_rate': '4.999e-05', 'epoch': '0.1478', 'num_input_tokens_seen': 12019984, 'train_runtime': '6080', 'train_tokens_per_second': '1977'} +{'loss': '0.8617', 'grad_norm': '1.622', 'learning_rate': '4.999e-05', 'epoch': '0.1479', 'num_input_tokens_seen': 12022031, 'train_runtime': '6081', 'train_tokens_per_second': '1977'} +{'loss': '1.362', 'grad_norm': '2.126', 'learning_rate': '4.999e-05', 'epoch': '0.1479', 'num_input_tokens_seen': 12024078, 'train_runtime': '6082', 'train_tokens_per_second': '1977'} +{'loss': '0.6179', 'grad_norm': '1.335', 'learning_rate': '4.999e-05', 'epoch': '0.1479', 'num_input_tokens_seen': 12026125, 'train_runtime': '6083', 'train_tokens_per_second': '1977'} +{'loss': '1.062', 'grad_norm': '1.567', 'learning_rate': '4.999e-05', 'epoch': '0.1479', 'num_input_tokens_seen': 12028172, 'train_runtime': '6084', 'train_tokens_per_second': '1977'} +{'loss': '0.5718', 'grad_norm': '1.391', 'learning_rate': '4.999e-05', 'epoch': '0.148', 'num_input_tokens_seen': 12030219, 'train_runtime': '6085', 'train_tokens_per_second': '1977'} +{'loss': '0.8279', 'grad_norm': '1.503', 'learning_rate': '4.999e-05', 'epoch': '0.148', 'num_input_tokens_seen': 12032266, 'train_runtime': '6086', 'train_tokens_per_second': '1977'} +{'loss': '1.324', 'grad_norm': '1.663', 'learning_rate': '4.999e-05', 'epoch': '0.148', 'num_input_tokens_seen': 12034313, 'train_runtime': '6087', 'train_tokens_per_second': '1977'} +{'loss': '0.377', 'grad_norm': '1.202', 'learning_rate': '4.999e-05', 'epoch': '0.148', 'num_input_tokens_seen': 12036360, 'train_runtime': '6088', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '1.501', 'learning_rate': '4.999e-05', 'epoch': '0.1481', 'num_input_tokens_seen': 12038407, 'train_runtime': '6089', 'train_tokens_per_second': '1977'} +{'loss': '0.3777', 'grad_norm': '0.8816', 'learning_rate': '4.999e-05', 'epoch': '0.1481', 'num_input_tokens_seen': 12040454, 'train_runtime': '6090', 'train_tokens_per_second': '1977'} +{'loss': '0.9572', 'grad_norm': '1.408', 'learning_rate': '4.999e-05', 'epoch': '0.1481', 'num_input_tokens_seen': 12042501, 'train_runtime': '6091', 'train_tokens_per_second': '1977'} +{'loss': '1.252', 'grad_norm': '1.786', 'learning_rate': '4.999e-05', 'epoch': '0.1481', 'num_input_tokens_seen': 12044548, 'train_runtime': '6093', 'train_tokens_per_second': '1977'} +{'loss': '0.6782', 'grad_norm': '1.553', 'learning_rate': '4.999e-05', 'epoch': '0.1482', 'num_input_tokens_seen': 12046595, 'train_runtime': '6094', 'train_tokens_per_second': '1977'} +{'loss': '0.6525', 'grad_norm': '1.626', 'learning_rate': '4.999e-05', 'epoch': '0.1482', 'num_input_tokens_seen': 12048642, 'train_runtime': '6095', 'train_tokens_per_second': '1977'} +{'loss': '1.483', 'grad_norm': '2.085', 'learning_rate': '4.999e-05', 'epoch': '0.1482', 'num_input_tokens_seen': 12050689, 'train_runtime': '6096', 'train_tokens_per_second': '1977'} +{'loss': '0.7186', 'grad_norm': '1.681', 'learning_rate': '4.999e-05', 'epoch': '0.1482', 'num_input_tokens_seen': 12052736, 'train_runtime': '6097', 'train_tokens_per_second': '1977'} +{'loss': '0.4033', 'grad_norm': '1.235', 'learning_rate': '4.999e-05', 'epoch': '0.1483', 'num_input_tokens_seen': 12054783, 'train_runtime': '6098', 'train_tokens_per_second': '1977'} +{'loss': '1.749', 'grad_norm': '2.159', 'learning_rate': '4.999e-05', 'epoch': '0.1483', 'num_input_tokens_seen': 12056830, 'train_runtime': '6099', 'train_tokens_per_second': '1977'} +{'loss': '2.197', 'grad_norm': '2.597', 'learning_rate': '4.999e-05', 'epoch': '0.1483', 'num_input_tokens_seen': 12058877, 'train_runtime': '6100', 'train_tokens_per_second': '1977'} +{'loss': '0.6925', 'grad_norm': '1.323', 'learning_rate': '4.999e-05', 'epoch': '0.1483', 'num_input_tokens_seen': 12060924, 'train_runtime': '6101', 'train_tokens_per_second': '1977'} +{'loss': '0.7737', 'grad_norm': '1.176', 'learning_rate': '4.999e-05', 'epoch': '0.1484', 'num_input_tokens_seen': 12062971, 'train_runtime': '6102', 'train_tokens_per_second': '1977'} +{'loss': '1.661', 'grad_norm': '2.441', 'learning_rate': '4.999e-05', 'epoch': '0.1484', 'num_input_tokens_seen': 12065018, 'train_runtime': '6103', 'train_tokens_per_second': '1977'} +{'loss': '0.3258', 'grad_norm': '1.142', 'learning_rate': '4.999e-05', 'epoch': '0.1484', 'num_input_tokens_seen': 12067065, 'train_runtime': '6104', 'train_tokens_per_second': '1977'} +{'loss': '0.6535', 'grad_norm': '1.495', 'learning_rate': '4.999e-05', 'epoch': '0.1485', 'num_input_tokens_seen': 12069112, 'train_runtime': '6105', 'train_tokens_per_second': '1977'} +{'loss': '1.697', 'grad_norm': '2.366', 'learning_rate': '4.999e-05', 'epoch': '0.1485', 'num_input_tokens_seen': 12071159, 'train_runtime': '6106', 'train_tokens_per_second': '1977'} +{'loss': '1.759', 'grad_norm': '2.592', 'learning_rate': '4.999e-05', 'epoch': '0.1485', 'num_input_tokens_seen': 12073206, 'train_runtime': '6107', 'train_tokens_per_second': '1977'} +{'loss': '0.4153', 'grad_norm': '1.02', 'learning_rate': '4.999e-05', 'epoch': '0.1485', 'num_input_tokens_seen': 12075253, 'train_runtime': '6108', 'train_tokens_per_second': '1977'} +{'loss': '1.744', 'grad_norm': '2.457', 'learning_rate': '4.999e-05', 'epoch': '0.1486', 'num_input_tokens_seen': 12077300, 'train_runtime': '6109', 'train_tokens_per_second': '1977'} +{'loss': '0.7803', 'grad_norm': '1.611', 'learning_rate': '4.999e-05', 'epoch': '0.1486', 'num_input_tokens_seen': 12079347, 'train_runtime': '6110', 'train_tokens_per_second': '1977'} +{'loss': '0.4261', 'grad_norm': '1.176', 'learning_rate': '4.999e-05', 'epoch': '0.1486', 'num_input_tokens_seen': 12081394, 'train_runtime': '6111', 'train_tokens_per_second': '1977'} +{'loss': '1.599', 'grad_norm': '2.054', 'learning_rate': '4.999e-05', 'epoch': '0.1486', 'num_input_tokens_seen': 12083441, 'train_runtime': '6112', 'train_tokens_per_second': '1977'} +{'loss': '0.8499', 'grad_norm': '1.681', 'learning_rate': '4.999e-05', 'epoch': '0.1487', 'num_input_tokens_seen': 12085488, 'train_runtime': '6113', 'train_tokens_per_second': '1977'} +{'loss': '1.757', 'grad_norm': '2.525', 'learning_rate': '4.999e-05', 'epoch': '0.1487', 'num_input_tokens_seen': 12087535, 'train_runtime': '6114', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '1.32', 'learning_rate': '4.999e-05', 'epoch': '0.1487', 'num_input_tokens_seen': 12089582, 'train_runtime': '6115', 'train_tokens_per_second': '1977'} +{'loss': '0.6684', 'grad_norm': '1.426', 'learning_rate': '4.999e-05', 'epoch': '0.1487', 'num_input_tokens_seen': 12091629, 'train_runtime': '6116', 'train_tokens_per_second': '1977'} +{'loss': '0.3761', 'grad_norm': '0.8944', 'learning_rate': '4.999e-05', 'epoch': '0.1488', 'num_input_tokens_seen': 12093676, 'train_runtime': '6117', 'train_tokens_per_second': '1977'} +{'loss': '0.2623', 'grad_norm': '0.9845', 'learning_rate': '4.999e-05', 'epoch': '0.1488', 'num_input_tokens_seen': 12095723, 'train_runtime': '6118', 'train_tokens_per_second': '1977'} +{'loss': '0.656', 'grad_norm': '1.607', 'learning_rate': '4.999e-05', 'epoch': '0.1488', 'num_input_tokens_seen': 12097770, 'train_runtime': '6119', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '1.619', 'learning_rate': '4.999e-05', 'epoch': '0.1488', 'num_input_tokens_seen': 12099817, 'train_runtime': '6120', 'train_tokens_per_second': '1977'} +{'loss': '0.8609', 'grad_norm': '1.377', 'learning_rate': '4.999e-05', 'epoch': '0.1489', 'num_input_tokens_seen': 12101864, 'train_runtime': '6121', 'train_tokens_per_second': '1977'} +{'loss': '0.4247', 'grad_norm': '1.108', 'learning_rate': '4.999e-05', 'epoch': '0.1489', 'num_input_tokens_seen': 12103911, 'train_runtime': '6122', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.715', 'learning_rate': '4.999e-05', 'epoch': '0.1489', 'num_input_tokens_seen': 12105958, 'train_runtime': '6123', 'train_tokens_per_second': '1977'} +{'loss': '0.6607', 'grad_norm': '1.579', 'learning_rate': '4.999e-05', 'epoch': '0.1489', 'num_input_tokens_seen': 12108005, 'train_runtime': '6125', 'train_tokens_per_second': '1977'} +{'loss': '0.7996', 'grad_norm': '1.491', 'learning_rate': '4.999e-05', 'epoch': '0.149', 'num_input_tokens_seen': 12110052, 'train_runtime': '6126', 'train_tokens_per_second': '1977'} +{'loss': '1.824', 'grad_norm': '2.319', 'learning_rate': '4.999e-05', 'epoch': '0.149', 'num_input_tokens_seen': 12112099, 'train_runtime': '6127', 'train_tokens_per_second': '1977'} +{'loss': '0.6155', 'grad_norm': '1.41', 'learning_rate': '4.999e-05', 'epoch': '0.149', 'num_input_tokens_seen': 12114146, 'train_runtime': '6128', 'train_tokens_per_second': '1977'} +{'loss': '1.053', 'grad_norm': '1.653', 'learning_rate': '4.999e-05', 'epoch': '0.149', 'num_input_tokens_seen': 12116193, 'train_runtime': '6129', 'train_tokens_per_second': '1977'} +{'loss': '2.259', 'grad_norm': '2.597', 'learning_rate': '4.999e-05', 'epoch': '0.1491', 'num_input_tokens_seen': 12118240, 'train_runtime': '6130', 'train_tokens_per_second': '1977'} +{'loss': '0.3459', 'grad_norm': '1.161', 'learning_rate': '4.999e-05', 'epoch': '0.1491', 'num_input_tokens_seen': 12120287, 'train_runtime': '6131', 'train_tokens_per_second': '1977'} +{'loss': '0.8978', 'grad_norm': '2.259', 'learning_rate': '4.999e-05', 'epoch': '0.1491', 'num_input_tokens_seen': 12122334, 'train_runtime': '6132', 'train_tokens_per_second': '1977'} +{'loss': '0.7826', 'grad_norm': '1.719', 'learning_rate': '4.999e-05', 'epoch': '0.1491', 'num_input_tokens_seen': 12124381, 'train_runtime': '6133', 'train_tokens_per_second': '1977'} +{'loss': '1.301', 'grad_norm': '1.673', 'learning_rate': '4.999e-05', 'epoch': '0.1492', 'num_input_tokens_seen': 12126428, 'train_runtime': '6134', 'train_tokens_per_second': '1977'} +{'loss': '0.9459', 'grad_norm': '1.756', 'learning_rate': '4.999e-05', 'epoch': '0.1492', 'num_input_tokens_seen': 12128475, 'train_runtime': '6135', 'train_tokens_per_second': '1977'} +{'loss': '1.413', 'grad_norm': '2.493', 'learning_rate': '4.999e-05', 'epoch': '0.1492', 'num_input_tokens_seen': 12130522, 'train_runtime': '6136', 'train_tokens_per_second': '1977'} +{'loss': '0.9749', 'grad_norm': '1.464', 'learning_rate': '4.999e-05', 'epoch': '0.1492', 'num_input_tokens_seen': 12132569, 'train_runtime': '6137', 'train_tokens_per_second': '1977'} +{'loss': '0.4412', 'grad_norm': '1.024', 'learning_rate': '4.999e-05', 'epoch': '0.1493', 'num_input_tokens_seen': 12134616, 'train_runtime': '6138', 'train_tokens_per_second': '1977'} +{'loss': '1.803', 'grad_norm': '2.417', 'learning_rate': '4.999e-05', 'epoch': '0.1493', 'num_input_tokens_seen': 12136663, 'train_runtime': '6139', 'train_tokens_per_second': '1977'} +{'loss': '1.441', 'grad_norm': '2.047', 'learning_rate': '4.999e-05', 'epoch': '0.1493', 'num_input_tokens_seen': 12138710, 'train_runtime': '6140', 'train_tokens_per_second': '1977'} +{'loss': '1.378', 'grad_norm': '1.792', 'learning_rate': '4.999e-05', 'epoch': '0.1493', 'num_input_tokens_seen': 12140757, 'train_runtime': '6141', 'train_tokens_per_second': '1977'} +{'loss': '1.162', 'grad_norm': '1.935', 'learning_rate': '4.999e-05', 'epoch': '0.1494', 'num_input_tokens_seen': 12142804, 'train_runtime': '6142', 'train_tokens_per_second': '1977'} +{'loss': '0.5901', 'grad_norm': '1.508', 'learning_rate': '4.999e-05', 'epoch': '0.1494', 'num_input_tokens_seen': 12144851, 'train_runtime': '6143', 'train_tokens_per_second': '1977'} +{'loss': '0.337', 'grad_norm': '1.055', 'learning_rate': '4.999e-05', 'epoch': '0.1494', 'num_input_tokens_seen': 12146898, 'train_runtime': '6144', 'train_tokens_per_second': '1977'} +{'loss': '1.461', 'grad_norm': '2.308', 'learning_rate': '4.999e-05', 'epoch': '0.1494', 'num_input_tokens_seen': 12148945, 'train_runtime': '6145', 'train_tokens_per_second': '1977'} +{'loss': '0.825', 'grad_norm': '1.488', 'learning_rate': '4.999e-05', 'epoch': '0.1495', 'num_input_tokens_seen': 12150992, 'train_runtime': '6146', 'train_tokens_per_second': '1977'} +{'loss': '0.7941', 'grad_norm': '1.495', 'learning_rate': '4.999e-05', 'epoch': '0.1495', 'num_input_tokens_seen': 12153039, 'train_runtime': '6147', 'train_tokens_per_second': '1977'} +{'loss': '1.199', 'grad_norm': '1.942', 'learning_rate': '4.999e-05', 'epoch': '0.1495', 'num_input_tokens_seen': 12155086, 'train_runtime': '6148', 'train_tokens_per_second': '1977'} +{'loss': '0.9527', 'grad_norm': '1.494', 'learning_rate': '4.999e-05', 'epoch': '0.1495', 'num_input_tokens_seen': 12157133, 'train_runtime': '6149', 'train_tokens_per_second': '1977'} +{'loss': '0.6957', 'grad_norm': '1.327', 'learning_rate': '4.999e-05', 'epoch': '0.1496', 'num_input_tokens_seen': 12159180, 'train_runtime': '6150', 'train_tokens_per_second': '1977'} +{'loss': '0.2926', 'grad_norm': '1.128', 'learning_rate': '4.999e-05', 'epoch': '0.1496', 'num_input_tokens_seen': 12161227, 'train_runtime': '6151', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.594', 'learning_rate': '4.999e-05', 'epoch': '0.1496', 'num_input_tokens_seen': 12163274, 'train_runtime': '6152', 'train_tokens_per_second': '1977'} +{'loss': '0.9477', 'grad_norm': '2.058', 'learning_rate': '4.999e-05', 'epoch': '0.1496', 'num_input_tokens_seen': 12165321, 'train_runtime': '6153', 'train_tokens_per_second': '1977'} +{'loss': '1.943', 'grad_norm': '2.282', 'learning_rate': '4.999e-05', 'epoch': '0.1497', 'num_input_tokens_seen': 12167368, 'train_runtime': '6154', 'train_tokens_per_second': '1977'} +{'loss': '0.7752', 'grad_norm': '1.515', 'learning_rate': '4.999e-05', 'epoch': '0.1497', 'num_input_tokens_seen': 12169415, 'train_runtime': '6156', 'train_tokens_per_second': '1977'} +{'loss': '0.3573', 'grad_norm': '1.344', 'learning_rate': '4.999e-05', 'epoch': '0.1497', 'num_input_tokens_seen': 12171462, 'train_runtime': '6157', 'train_tokens_per_second': '1977'} +{'loss': '0.386', 'grad_norm': '1.058', 'learning_rate': '4.999e-05', 'epoch': '0.1497', 'num_input_tokens_seen': 12173509, 'train_runtime': '6158', 'train_tokens_per_second': '1977'} +{'loss': '0.5384', 'grad_norm': '1.209', 'learning_rate': '4.999e-05', 'epoch': '0.1498', 'num_input_tokens_seen': 12175556, 'train_runtime': '6159', 'train_tokens_per_second': '1977'} +{'loss': '1.05', 'grad_norm': '1.773', 'learning_rate': '4.999e-05', 'epoch': '0.1498', 'num_input_tokens_seen': 12177603, 'train_runtime': '6160', 'train_tokens_per_second': '1977'} +{'loss': '0.509', 'grad_norm': '1.118', 'learning_rate': '4.999e-05', 'epoch': '0.1498', 'num_input_tokens_seen': 12179650, 'train_runtime': '6161', 'train_tokens_per_second': '1977'} +{'loss': '0.4985', 'grad_norm': '1.891', 'learning_rate': '4.999e-05', 'epoch': '0.1498', 'num_input_tokens_seen': 12181697, 'train_runtime': '6162', 'train_tokens_per_second': '1977'} +{'loss': '0.341', 'grad_norm': '1.185', 'learning_rate': '4.999e-05', 'epoch': '0.1499', 'num_input_tokens_seen': 12183744, 'train_runtime': '6163', 'train_tokens_per_second': '1977'} +{'loss': '0.3302', 'grad_norm': '0.9265', 'learning_rate': '4.999e-05', 'epoch': '0.1499', 'num_input_tokens_seen': 12185791, 'train_runtime': '6164', 'train_tokens_per_second': '1977'} +{'loss': '1.83', 'grad_norm': '2.684', 'learning_rate': '4.999e-05', 'epoch': '0.1499', 'num_input_tokens_seen': 12187838, 'train_runtime': '6165', 'train_tokens_per_second': '1977'} +{'loss': '1.419', 'grad_norm': '2.312', 'learning_rate': '4.999e-05', 'epoch': '0.1499', 'num_input_tokens_seen': 12189885, 'train_runtime': '6166', 'train_tokens_per_second': '1977'} +{'loss': '1.357', 'grad_norm': '2.227', 'learning_rate': '4.999e-05', 'epoch': '0.15', 'num_input_tokens_seen': 12191932, 'train_runtime': '6167', 'train_tokens_per_second': '1977'} +{'loss': '0.5799', 'grad_norm': '1.311', 'learning_rate': '4.999e-05', 'epoch': '0.15', 'num_input_tokens_seen': 12193979, 'train_runtime': '6168', 'train_tokens_per_second': '1977'} +{'loss': '1.216', 'grad_norm': '1.913', 'learning_rate': '4.999e-05', 'epoch': '0.15', 'num_input_tokens_seen': 12196026, 'train_runtime': '6169', 'train_tokens_per_second': '1977'} +{'loss': '2.043', 'grad_norm': '2.641', 'learning_rate': '4.999e-05', 'epoch': '0.15', 'num_input_tokens_seen': 12198073, 'train_runtime': '6170', 'train_tokens_per_second': '1977'} +{'loss': '0.2503', 'grad_norm': '1.177', 'learning_rate': '4.999e-05', 'epoch': '0.1501', 'num_input_tokens_seen': 12200120, 'train_runtime': '6171', 'train_tokens_per_second': '1977'} +{'loss': '0.8655', 'grad_norm': '1.382', 'learning_rate': '4.999e-05', 'epoch': '0.1501', 'num_input_tokens_seen': 12202167, 'train_runtime': '6172', 'train_tokens_per_second': '1977'} +{'loss': '0.8349', 'grad_norm': '1.16', 'learning_rate': '4.999e-05', 'epoch': '0.1501', 'num_input_tokens_seen': 12204214, 'train_runtime': '6173', 'train_tokens_per_second': '1977'} +{'loss': '1.592', 'grad_norm': '2.04', 'learning_rate': '4.999e-05', 'epoch': '0.1501', 'num_input_tokens_seen': 12206261, 'train_runtime': '6174', 'train_tokens_per_second': '1977'} +{'loss': '0.5668', 'grad_norm': '1.173', 'learning_rate': '4.999e-05', 'epoch': '0.1502', 'num_input_tokens_seen': 12208308, 'train_runtime': '6175', 'train_tokens_per_second': '1977'} +{'loss': '1.59', 'grad_norm': '2.884', 'learning_rate': '4.999e-05', 'epoch': '0.1502', 'num_input_tokens_seen': 12210355, 'train_runtime': '6176', 'train_tokens_per_second': '1977'} +{'loss': '1.443', 'grad_norm': '2.012', 'learning_rate': '4.999e-05', 'epoch': '0.1502', 'num_input_tokens_seen': 12212402, 'train_runtime': '6177', 'train_tokens_per_second': '1977'} +{'loss': '0.8653', 'grad_norm': '1.62', 'learning_rate': '4.999e-05', 'epoch': '0.1502', 'num_input_tokens_seen': 12214449, 'train_runtime': '6178', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '2.062', 'learning_rate': '4.999e-05', 'epoch': '0.1503', 'num_input_tokens_seen': 12216496, 'train_runtime': '6179', 'train_tokens_per_second': '1977'} +{'loss': '0.4066', 'grad_norm': '0.913', 'learning_rate': '4.999e-05', 'epoch': '0.1503', 'num_input_tokens_seen': 12218543, 'train_runtime': '6180', 'train_tokens_per_second': '1977'} +{'loss': '0.3773', 'grad_norm': '0.9027', 'learning_rate': '4.999e-05', 'epoch': '0.1503', 'num_input_tokens_seen': 12220590, 'train_runtime': '6181', 'train_tokens_per_second': '1977'} +{'loss': '0.6043', 'grad_norm': '1.634', 'learning_rate': '4.999e-05', 'epoch': '0.1503', 'num_input_tokens_seen': 12222637, 'train_runtime': '6182', 'train_tokens_per_second': '1977'} +{'loss': '0.4396', 'grad_norm': '1.078', 'learning_rate': '4.999e-05', 'epoch': '0.1504', 'num_input_tokens_seen': 12224684, 'train_runtime': '6183', 'train_tokens_per_second': '1977'} +{'loss': '1.14', 'grad_norm': '1.738', 'learning_rate': '4.999e-05', 'epoch': '0.1504', 'num_input_tokens_seen': 12226731, 'train_runtime': '6184', 'train_tokens_per_second': '1977'} +{'loss': '0.9596', 'grad_norm': '1.5', 'learning_rate': '4.999e-05', 'epoch': '0.1504', 'num_input_tokens_seen': 12228778, 'train_runtime': '6185', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.524', 'learning_rate': '4.999e-05', 'epoch': '0.1504', 'num_input_tokens_seen': 12230825, 'train_runtime': '6187', 'train_tokens_per_second': '1977'} +{'loss': '1.846', 'grad_norm': '2.584', 'learning_rate': '4.999e-05', 'epoch': '0.1505', 'num_input_tokens_seen': 12232872, 'train_runtime': '6188', 'train_tokens_per_second': '1977'} +{'loss': '0.9299', 'grad_norm': '1.589', 'learning_rate': '4.999e-05', 'epoch': '0.1505', 'num_input_tokens_seen': 12234919, 'train_runtime': '6189', 'train_tokens_per_second': '1977'} +{'loss': '2.262', 'grad_norm': '2.876', 'learning_rate': '4.999e-05', 'epoch': '0.1505', 'num_input_tokens_seen': 12236966, 'train_runtime': '6190', 'train_tokens_per_second': '1977'} +{'loss': '0.291', 'grad_norm': '1.003', 'learning_rate': '4.999e-05', 'epoch': '0.1505', 'num_input_tokens_seen': 12239013, 'train_runtime': '6191', 'train_tokens_per_second': '1977'} +{'loss': '0.6459', 'grad_norm': '1.496', 'learning_rate': '4.999e-05', 'epoch': '0.1506', 'num_input_tokens_seen': 12241060, 'train_runtime': '6192', 'train_tokens_per_second': '1977'} +{'loss': '0.6158', 'grad_norm': '1.217', 'learning_rate': '4.999e-05', 'epoch': '0.1506', 'num_input_tokens_seen': 12243107, 'train_runtime': '6193', 'train_tokens_per_second': '1977'} +{'loss': '1.129', 'grad_norm': '1.625', 'learning_rate': '4.999e-05', 'epoch': '0.1506', 'num_input_tokens_seen': 12245154, 'train_runtime': '6194', 'train_tokens_per_second': '1977'} +{'loss': '0.7349', 'grad_norm': '1.359', 'learning_rate': '4.999e-05', 'epoch': '0.1506', 'num_input_tokens_seen': 12247201, 'train_runtime': '6195', 'train_tokens_per_second': '1977'} +{'loss': '0.3355', 'grad_norm': '0.9558', 'learning_rate': '4.999e-05', 'epoch': '0.1507', 'num_input_tokens_seen': 12249248, 'train_runtime': '6196', 'train_tokens_per_second': '1977'} +{'loss': '0.7361', 'grad_norm': '1.453', 'learning_rate': '4.999e-05', 'epoch': '0.1507', 'num_input_tokens_seen': 12251295, 'train_runtime': '6197', 'train_tokens_per_second': '1977'} +{'loss': '0.3175', 'grad_norm': '1.058', 'learning_rate': '4.999e-05', 'epoch': '0.1507', 'num_input_tokens_seen': 12253342, 'train_runtime': '6198', 'train_tokens_per_second': '1977'} +{'loss': '0.5999', 'grad_norm': '1.353', 'learning_rate': '4.999e-05', 'epoch': '0.1507', 'num_input_tokens_seen': 12255389, 'train_runtime': '6199', 'train_tokens_per_second': '1977'} +{'loss': '0.9349', 'grad_norm': '1.432', 'learning_rate': '4.999e-05', 'epoch': '0.1508', 'num_input_tokens_seen': 12257436, 'train_runtime': '6200', 'train_tokens_per_second': '1977'} +{'loss': '0.7669', 'grad_norm': '1.401', 'learning_rate': '4.999e-05', 'epoch': '0.1508', 'num_input_tokens_seen': 12259483, 'train_runtime': '6201', 'train_tokens_per_second': '1977'} +{'loss': '0.5539', 'grad_norm': '1.101', 'learning_rate': '4.999e-05', 'epoch': '0.1508', 'num_input_tokens_seen': 12261530, 'train_runtime': '6202', 'train_tokens_per_second': '1977'} +{'loss': '0.5582', 'grad_norm': '1.235', 'learning_rate': '4.999e-05', 'epoch': '0.1508', 'num_input_tokens_seen': 12263577, 'train_runtime': '6203', 'train_tokens_per_second': '1977'} +{'loss': '0.3008', 'grad_norm': '1.064', 'learning_rate': '4.999e-05', 'epoch': '0.1509', 'num_input_tokens_seen': 12265624, 'train_runtime': '6204', 'train_tokens_per_second': '1977'} +{'loss': '0.4018', 'grad_norm': '1.134', 'learning_rate': '4.999e-05', 'epoch': '0.1509', 'num_input_tokens_seen': 12267671, 'train_runtime': '6205', 'train_tokens_per_second': '1977'} +{'loss': '0.7615', 'grad_norm': '1.589', 'learning_rate': '4.999e-05', 'epoch': '0.1509', 'num_input_tokens_seen': 12269718, 'train_runtime': '6206', 'train_tokens_per_second': '1977'} +{'loss': '0.3849', 'grad_norm': '1.368', 'learning_rate': '4.999e-05', 'epoch': '0.1509', 'num_input_tokens_seen': 12271765, 'train_runtime': '6207', 'train_tokens_per_second': '1977'} +{'loss': '0.7367', 'grad_norm': '1.445', 'learning_rate': '4.999e-05', 'epoch': '0.151', 'num_input_tokens_seen': 12273812, 'train_runtime': '6208', 'train_tokens_per_second': '1977'} +{'loss': '0.6791', 'grad_norm': '1.27', 'learning_rate': '4.999e-05', 'epoch': '0.151', 'num_input_tokens_seen': 12275859, 'train_runtime': '6209', 'train_tokens_per_second': '1977'} +{'loss': '2.358', 'grad_norm': '2.374', 'learning_rate': '4.999e-05', 'epoch': '0.151', 'num_input_tokens_seen': 12277906, 'train_runtime': '6210', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '2.185', 'learning_rate': '4.999e-05', 'epoch': '0.151', 'num_input_tokens_seen': 12279953, 'train_runtime': '6211', 'train_tokens_per_second': '1977'} +{'loss': '1.654', 'grad_norm': '2.649', 'learning_rate': '4.999e-05', 'epoch': '0.1511', 'num_input_tokens_seen': 12282000, 'train_runtime': '6212', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 04:20:57,877 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 04:20:57,877 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 04:20:58,263 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-6000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 04:20:58,270 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-6000/tokenizer_config.json + +{'loss': '1.62', 'grad_norm': '1.977', 'learning_rate': '4.999e-05', 'epoch': '0.1511', 'num_input_tokens_seen': 12284047, 'train_runtime': '6214', 'train_tokens_per_second': '1977'} +{'loss': '0.7237', 'grad_norm': '1.363', 'learning_rate': '4.999e-05', 'epoch': '0.1511', 'num_input_tokens_seen': 12286094, 'train_runtime': '6215', 'train_tokens_per_second': '1977'} +{'loss': '0.9533', 'grad_norm': '1.574', 'learning_rate': '4.999e-05', 'epoch': '0.1511', 'num_input_tokens_seen': 12288141, 'train_runtime': '6216', 'train_tokens_per_second': '1977'} +{'loss': '0.6701', 'grad_norm': '1.3', 'learning_rate': '4.999e-05', 'epoch': '0.1512', 'num_input_tokens_seen': 12290188, 'train_runtime': '6217', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '2.134', 'learning_rate': '4.999e-05', 'epoch': '0.1512', 'num_input_tokens_seen': 12292235, 'train_runtime': '6218', 'train_tokens_per_second': '1977'} +{'loss': '0.8962', 'grad_norm': '1.381', 'learning_rate': '4.999e-05', 'epoch': '0.1512', 'num_input_tokens_seen': 12294282, 'train_runtime': '6219', 'train_tokens_per_second': '1977'} +{'loss': '0.9391', 'grad_norm': '1.646', 'learning_rate': '4.999e-05', 'epoch': '0.1512', 'num_input_tokens_seen': 12296329, 'train_runtime': '6220', 'train_tokens_per_second': '1977'} +{'loss': '0.3989', 'grad_norm': '1.025', 'learning_rate': '4.999e-05', 'epoch': '0.1513', 'num_input_tokens_seen': 12298376, 'train_runtime': '6221', 'train_tokens_per_second': '1977'} +{'loss': '2.892', 'grad_norm': '2.698', 'learning_rate': '4.999e-05', 'epoch': '0.1513', 'num_input_tokens_seen': 12300423, 'train_runtime': '6222', 'train_tokens_per_second': '1977'} +{'loss': '1.18', 'grad_norm': '1.745', 'learning_rate': '4.999e-05', 'epoch': '0.1513', 'num_input_tokens_seen': 12302470, 'train_runtime': '6223', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '1.848', 'learning_rate': '4.999e-05', 'epoch': '0.1513', 'num_input_tokens_seen': 12304517, 'train_runtime': '6224', 'train_tokens_per_second': '1977'} +{'loss': '1.418', 'grad_norm': '2.027', 'learning_rate': '4.999e-05', 'epoch': '0.1514', 'num_input_tokens_seen': 12306564, 'train_runtime': '6225', 'train_tokens_per_second': '1977'} +{'loss': '0.8662', 'grad_norm': '1.828', 'learning_rate': '4.999e-05', 'epoch': '0.1514', 'num_input_tokens_seen': 12308611, 'train_runtime': '6226', 'train_tokens_per_second': '1977'} +{'loss': '2.329', 'grad_norm': '2.548', 'learning_rate': '4.999e-05', 'epoch': '0.1514', 'num_input_tokens_seen': 12310658, 'train_runtime': '6227', 'train_tokens_per_second': '1977'} +{'loss': '1.543', 'grad_norm': '2.008', 'learning_rate': '4.999e-05', 'epoch': '0.1514', 'num_input_tokens_seen': 12312705, 'train_runtime': '6228', 'train_tokens_per_second': '1977'} +{'loss': '0.8298', 'grad_norm': '1.327', 'learning_rate': '4.999e-05', 'epoch': '0.1515', 'num_input_tokens_seen': 12314752, 'train_runtime': '6230', 'train_tokens_per_second': '1977'} +{'loss': '0.3822', 'grad_norm': '1.144', 'learning_rate': '4.999e-05', 'epoch': '0.1515', 'num_input_tokens_seen': 12316799, 'train_runtime': '6231', 'train_tokens_per_second': '1977'} +{'loss': '0.7797', 'grad_norm': '1.395', 'learning_rate': '4.999e-05', 'epoch': '0.1515', 'num_input_tokens_seen': 12318846, 'train_runtime': '6232', 'train_tokens_per_second': '1977'} +{'loss': '1.588', 'grad_norm': '2.262', 'learning_rate': '4.999e-05', 'epoch': '0.1515', 'num_input_tokens_seen': 12320893, 'train_runtime': '6233', 'train_tokens_per_second': '1977'} +{'loss': '0.6714', 'grad_norm': '1.55', 'learning_rate': '4.999e-05', 'epoch': '0.1516', 'num_input_tokens_seen': 12322940, 'train_runtime': '6234', 'train_tokens_per_second': '1977'} +{'loss': '1.237', 'grad_norm': '1.7', 'learning_rate': '4.999e-05', 'epoch': '0.1516', 'num_input_tokens_seen': 12324987, 'train_runtime': '6235', 'train_tokens_per_second': '1977'} +{'loss': '0.8417', 'grad_norm': '1.607', 'learning_rate': '4.999e-05', 'epoch': '0.1516', 'num_input_tokens_seen': 12327034, 'train_runtime': '6236', 'train_tokens_per_second': '1977'} +{'loss': '1.769', 'grad_norm': '2.689', 'learning_rate': '4.999e-05', 'epoch': '0.1516', 'num_input_tokens_seen': 12329081, 'train_runtime': '6237', 'train_tokens_per_second': '1977'} +{'loss': '0.6854', 'grad_norm': '1.47', 'learning_rate': '4.999e-05', 'epoch': '0.1517', 'num_input_tokens_seen': 12331128, 'train_runtime': '6238', 'train_tokens_per_second': '1977'} +{'loss': '0.6763', 'grad_norm': '1.105', 'learning_rate': '4.999e-05', 'epoch': '0.1517', 'num_input_tokens_seen': 12333175, 'train_runtime': '6239', 'train_tokens_per_second': '1977'} +{'loss': '0.3374', 'grad_norm': '1.138', 'learning_rate': '4.999e-05', 'epoch': '0.1517', 'num_input_tokens_seen': 12335222, 'train_runtime': '6240', 'train_tokens_per_second': '1977'} +{'loss': '0.5461', 'grad_norm': '1.441', 'learning_rate': '4.999e-05', 'epoch': '0.1517', 'num_input_tokens_seen': 12337269, 'train_runtime': '6241', 'train_tokens_per_second': '1977'} +{'loss': '0.396', 'grad_norm': '1.12', 'learning_rate': '4.999e-05', 'epoch': '0.1518', 'num_input_tokens_seen': 12339316, 'train_runtime': '6242', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '2.071', 'learning_rate': '4.999e-05', 'epoch': '0.1518', 'num_input_tokens_seen': 12341363, 'train_runtime': '6243', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '2.242', 'learning_rate': '4.999e-05', 'epoch': '0.1518', 'num_input_tokens_seen': 12343410, 'train_runtime': '6244', 'train_tokens_per_second': '1977'} +{'loss': '0.588', 'grad_norm': '1.313', 'learning_rate': '4.999e-05', 'epoch': '0.1518', 'num_input_tokens_seen': 12345457, 'train_runtime': '6245', 'train_tokens_per_second': '1977'} +{'loss': '0.9974', 'grad_norm': '1.773', 'learning_rate': '4.999e-05', 'epoch': '0.1519', 'num_input_tokens_seen': 12347504, 'train_runtime': '6246', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '1.845', 'learning_rate': '4.999e-05', 'epoch': '0.1519', 'num_input_tokens_seen': 12349551, 'train_runtime': '6247', 'train_tokens_per_second': '1977'} +{'loss': '0.706', 'grad_norm': '1.431', 'learning_rate': '4.999e-05', 'epoch': '0.1519', 'num_input_tokens_seen': 12351598, 'train_runtime': '6248', 'train_tokens_per_second': '1977'} +{'loss': '0.9867', 'grad_norm': '1.847', 'learning_rate': '4.999e-05', 'epoch': '0.152', 'num_input_tokens_seen': 12353645, 'train_runtime': '6249', 'train_tokens_per_second': '1977'} +{'loss': '0.2791', 'grad_norm': '1.068', 'learning_rate': '4.999e-05', 'epoch': '0.152', 'num_input_tokens_seen': 12355692, 'train_runtime': '6250', 'train_tokens_per_second': '1977'} +{'loss': '0.5482', 'grad_norm': '1.398', 'learning_rate': '4.999e-05', 'epoch': '0.152', 'num_input_tokens_seen': 12357739, 'train_runtime': '6251', 'train_tokens_per_second': '1977'} +{'loss': '0.404', 'grad_norm': '0.8956', 'learning_rate': '4.999e-05', 'epoch': '0.152', 'num_input_tokens_seen': 12359786, 'train_runtime': '6252', 'train_tokens_per_second': '1977'} +{'loss': '0.7742', 'grad_norm': '1.205', 'learning_rate': '4.999e-05', 'epoch': '0.1521', 'num_input_tokens_seen': 12361833, 'train_runtime': '6253', 'train_tokens_per_second': '1977'} +{'loss': '1.778', 'grad_norm': '2.867', 'learning_rate': '4.999e-05', 'epoch': '0.1521', 'num_input_tokens_seen': 12363880, 'train_runtime': '6254', 'train_tokens_per_second': '1977'} +{'loss': '0.3279', 'grad_norm': '1.055', 'learning_rate': '4.999e-05', 'epoch': '0.1521', 'num_input_tokens_seen': 12365927, 'train_runtime': '6255', 'train_tokens_per_second': '1977'} +{'loss': '0.2574', 'grad_norm': '0.9298', 'learning_rate': '4.999e-05', 'epoch': '0.1521', 'num_input_tokens_seen': 12367974, 'train_runtime': '6256', 'train_tokens_per_second': '1977'} +{'loss': '0.3951', 'grad_norm': '1.031', 'learning_rate': '4.999e-05', 'epoch': '0.1522', 'num_input_tokens_seen': 12370021, 'train_runtime': '6257', 'train_tokens_per_second': '1977'} +{'loss': '0.8991', 'grad_norm': '1.572', 'learning_rate': '4.999e-05', 'epoch': '0.1522', 'num_input_tokens_seen': 12372068, 'train_runtime': '6258', 'train_tokens_per_second': '1977'} +{'loss': '0.4477', 'grad_norm': '0.8649', 'learning_rate': '4.999e-05', 'epoch': '0.1522', 'num_input_tokens_seen': 12374115, 'train_runtime': '6259', 'train_tokens_per_second': '1977'} +{'loss': '0.9198', 'grad_norm': '1.341', 'learning_rate': '4.999e-05', 'epoch': '0.1522', 'num_input_tokens_seen': 12376162, 'train_runtime': '6261', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.063', 'learning_rate': '4.999e-05', 'epoch': '0.1523', 'num_input_tokens_seen': 12378209, 'train_runtime': '6262', 'train_tokens_per_second': '1977'} +{'loss': '0.7667', 'grad_norm': '1.814', 'learning_rate': '4.999e-05', 'epoch': '0.1523', 'num_input_tokens_seen': 12380256, 'train_runtime': '6263', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '1.692', 'learning_rate': '4.999e-05', 'epoch': '0.1523', 'num_input_tokens_seen': 12382303, 'train_runtime': '6264', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '2.209', 'learning_rate': '4.999e-05', 'epoch': '0.1523', 'num_input_tokens_seen': 12384350, 'train_runtime': '6265', 'train_tokens_per_second': '1977'} +{'loss': '1.361', 'grad_norm': '1.932', 'learning_rate': '4.999e-05', 'epoch': '0.1524', 'num_input_tokens_seen': 12386397, 'train_runtime': '6266', 'train_tokens_per_second': '1977'} +{'loss': '1.955', 'grad_norm': '4.027', 'learning_rate': '4.999e-05', 'epoch': '0.1524', 'num_input_tokens_seen': 12388444, 'train_runtime': '6267', 'train_tokens_per_second': '1977'} +{'loss': '0.8153', 'grad_norm': '1.459', 'learning_rate': '4.999e-05', 'epoch': '0.1524', 'num_input_tokens_seen': 12390491, 'train_runtime': '6268', 'train_tokens_per_second': '1977'} +{'loss': '0.7938', 'grad_norm': '1.373', 'learning_rate': '4.999e-05', 'epoch': '0.1524', 'num_input_tokens_seen': 12392538, 'train_runtime': '6269', 'train_tokens_per_second': '1977'} +{'loss': '1.503', 'grad_norm': '1.958', 'learning_rate': '4.999e-05', 'epoch': '0.1525', 'num_input_tokens_seen': 12394585, 'train_runtime': '6270', 'train_tokens_per_second': '1977'} +{'loss': '0.9585', 'grad_norm': '1.442', 'learning_rate': '4.999e-05', 'epoch': '0.1525', 'num_input_tokens_seen': 12396632, 'train_runtime': '6271', 'train_tokens_per_second': '1977'} +{'loss': '0.6564', 'grad_norm': '1.08', 'learning_rate': '4.999e-05', 'epoch': '0.1525', 'num_input_tokens_seen': 12398679, 'train_runtime': '6272', 'train_tokens_per_second': '1977'} +{'loss': '2.799', 'grad_norm': '2.543', 'learning_rate': '4.999e-05', 'epoch': '0.1525', 'num_input_tokens_seen': 12400726, 'train_runtime': '6273', 'train_tokens_per_second': '1977'} +{'loss': '3.154', 'grad_norm': '2.616', 'learning_rate': '4.999e-05', 'epoch': '0.1526', 'num_input_tokens_seen': 12402773, 'train_runtime': '6274', 'train_tokens_per_second': '1977'} +{'loss': '0.822', 'grad_norm': '1.538', 'learning_rate': '4.999e-05', 'epoch': '0.1526', 'num_input_tokens_seen': 12404820, 'train_runtime': '6275', 'train_tokens_per_second': '1977'} +{'loss': '0.6925', 'grad_norm': '1.455', 'learning_rate': '4.999e-05', 'epoch': '0.1526', 'num_input_tokens_seen': 12406867, 'train_runtime': '6276', 'train_tokens_per_second': '1977'} +{'loss': '0.9298', 'grad_norm': '1.315', 'learning_rate': '4.999e-05', 'epoch': '0.1526', 'num_input_tokens_seen': 12408914, 'train_runtime': '6277', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '1.986', 'learning_rate': '4.999e-05', 'epoch': '0.1527', 'num_input_tokens_seen': 12410961, 'train_runtime': '6278', 'train_tokens_per_second': '1977'} +{'loss': '0.4921', 'grad_norm': '1.217', 'learning_rate': '4.999e-05', 'epoch': '0.1527', 'num_input_tokens_seen': 12413008, 'train_runtime': '6279', 'train_tokens_per_second': '1977'} +{'loss': '1.055', 'grad_norm': '1.604', 'learning_rate': '4.999e-05', 'epoch': '0.1527', 'num_input_tokens_seen': 12415055, 'train_runtime': '6280', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.715', 'learning_rate': '4.999e-05', 'epoch': '0.1527', 'num_input_tokens_seen': 12417102, 'train_runtime': '6281', 'train_tokens_per_second': '1977'} +{'loss': '0.5722', 'grad_norm': '1.255', 'learning_rate': '4.999e-05', 'epoch': '0.1528', 'num_input_tokens_seen': 12419149, 'train_runtime': '6282', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '1.355', 'learning_rate': '4.999e-05', 'epoch': '0.1528', 'num_input_tokens_seen': 12421196, 'train_runtime': '6283', 'train_tokens_per_second': '1977'} +{'loss': '0.7481', 'grad_norm': '1.235', 'learning_rate': '4.999e-05', 'epoch': '0.1528', 'num_input_tokens_seen': 12423243, 'train_runtime': '6284', 'train_tokens_per_second': '1977'} +{'loss': '0.5585', 'grad_norm': '1.31', 'learning_rate': '4.999e-05', 'epoch': '0.1528', 'num_input_tokens_seen': 12425290, 'train_runtime': '6285', 'train_tokens_per_second': '1977'} +{'loss': '0.8418', 'grad_norm': '1.425', 'learning_rate': '4.999e-05', 'epoch': '0.1529', 'num_input_tokens_seen': 12427337, 'train_runtime': '6286', 'train_tokens_per_second': '1977'} +{'loss': '0.5837', 'grad_norm': '1.42', 'learning_rate': '4.999e-05', 'epoch': '0.1529', 'num_input_tokens_seen': 12429384, 'train_runtime': '6287', 'train_tokens_per_second': '1977'} +{'loss': '0.5615', 'grad_norm': '1.183', 'learning_rate': '4.999e-05', 'epoch': '0.1529', 'num_input_tokens_seen': 12431431, 'train_runtime': '6288', 'train_tokens_per_second': '1977'} +{'loss': '0.8281', 'grad_norm': '1.844', 'learning_rate': '4.999e-05', 'epoch': '0.1529', 'num_input_tokens_seen': 12433478, 'train_runtime': '6289', 'train_tokens_per_second': '1977'} +{'loss': '0.8822', 'grad_norm': '1.365', 'learning_rate': '4.999e-05', 'epoch': '0.153', 'num_input_tokens_seen': 12435525, 'train_runtime': '6291', 'train_tokens_per_second': '1977'} +{'loss': '0.3762', 'grad_norm': '0.9344', 'learning_rate': '4.999e-05', 'epoch': '0.153', 'num_input_tokens_seen': 12437572, 'train_runtime': '6292', 'train_tokens_per_second': '1977'} +{'loss': '0.6569', 'grad_norm': '1.56', 'learning_rate': '4.999e-05', 'epoch': '0.153', 'num_input_tokens_seen': 12439619, 'train_runtime': '6293', 'train_tokens_per_second': '1977'} +{'loss': '0.9679', 'grad_norm': '1.514', 'learning_rate': '4.999e-05', 'epoch': '0.153', 'num_input_tokens_seen': 12441666, 'train_runtime': '6294', 'train_tokens_per_second': '1977'} +{'loss': '1.024', 'grad_norm': '1.478', 'learning_rate': '4.999e-05', 'epoch': '0.1531', 'num_input_tokens_seen': 12443713, 'train_runtime': '6295', 'train_tokens_per_second': '1977'} +{'loss': '0.7309', 'grad_norm': '1.61', 'learning_rate': '4.999e-05', 'epoch': '0.1531', 'num_input_tokens_seen': 12445760, 'train_runtime': '6296', 'train_tokens_per_second': '1977'} +{'loss': '0.3484', 'grad_norm': '0.9925', 'learning_rate': '4.999e-05', 'epoch': '0.1531', 'num_input_tokens_seen': 12447807, 'train_runtime': '6297', 'train_tokens_per_second': '1977'} +{'loss': '1.064', 'grad_norm': '1.719', 'learning_rate': '4.999e-05', 'epoch': '0.1531', 'num_input_tokens_seen': 12449854, 'train_runtime': '6298', 'train_tokens_per_second': '1977'} +{'loss': '1.246', 'grad_norm': '1.904', 'learning_rate': '4.999e-05', 'epoch': '0.1532', 'num_input_tokens_seen': 12451901, 'train_runtime': '6299', 'train_tokens_per_second': '1977'} +{'loss': '1.417', 'grad_norm': '2.467', 'learning_rate': '4.999e-05', 'epoch': '0.1532', 'num_input_tokens_seen': 12453948, 'train_runtime': '6300', 'train_tokens_per_second': '1977'} +{'loss': '1.277', 'grad_norm': '2.358', 'learning_rate': '4.999e-05', 'epoch': '0.1532', 'num_input_tokens_seen': 12455995, 'train_runtime': '6301', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.453', 'learning_rate': '4.999e-05', 'epoch': '0.1532', 'num_input_tokens_seen': 12458042, 'train_runtime': '6302', 'train_tokens_per_second': '1977'} +{'loss': '1.726', 'grad_norm': '2.577', 'learning_rate': '4.999e-05', 'epoch': '0.1533', 'num_input_tokens_seen': 12460089, 'train_runtime': '6303', 'train_tokens_per_second': '1977'} +{'loss': '0.6987', 'grad_norm': '1.28', 'learning_rate': '4.999e-05', 'epoch': '0.1533', 'num_input_tokens_seen': 12462136, 'train_runtime': '6304', 'train_tokens_per_second': '1977'} +{'loss': '0.495', 'grad_norm': '1.274', 'learning_rate': '4.999e-05', 'epoch': '0.1533', 'num_input_tokens_seen': 12464183, 'train_runtime': '6305', 'train_tokens_per_second': '1977'} +{'loss': '0.5496', 'grad_norm': '1.125', 'learning_rate': '4.999e-05', 'epoch': '0.1533', 'num_input_tokens_seen': 12466230, 'train_runtime': '6306', 'train_tokens_per_second': '1977'} +{'loss': '0.7063', 'grad_norm': '1.23', 'learning_rate': '4.999e-05', 'epoch': '0.1534', 'num_input_tokens_seen': 12468277, 'train_runtime': '6307', 'train_tokens_per_second': '1977'} +{'loss': '0.3079', 'grad_norm': '1.249', 'learning_rate': '4.999e-05', 'epoch': '0.1534', 'num_input_tokens_seen': 12470324, 'train_runtime': '6308', 'train_tokens_per_second': '1977'} +{'loss': '0.6669', 'grad_norm': '1.345', 'learning_rate': '4.999e-05', 'epoch': '0.1534', 'num_input_tokens_seen': 12472371, 'train_runtime': '6309', 'train_tokens_per_second': '1977'} +{'loss': '0.4533', 'grad_norm': '1.072', 'learning_rate': '4.999e-05', 'epoch': '0.1534', 'num_input_tokens_seen': 12474418, 'train_runtime': '6310', 'train_tokens_per_second': '1977'} +{'loss': '0.3744', 'grad_norm': '1.035', 'learning_rate': '4.999e-05', 'epoch': '0.1535', 'num_input_tokens_seen': 12476465, 'train_runtime': '6311', 'train_tokens_per_second': '1977'} +{'loss': '0.8862', 'grad_norm': '1.171', 'learning_rate': '4.999e-05', 'epoch': '0.1535', 'num_input_tokens_seen': 12478512, 'train_runtime': '6312', 'train_tokens_per_second': '1977'} +{'loss': '0.8102', 'grad_norm': '1.308', 'learning_rate': '4.999e-05', 'epoch': '0.1535', 'num_input_tokens_seen': 12480559, 'train_runtime': '6313', 'train_tokens_per_second': '1977'} +{'loss': '0.6806', 'grad_norm': '1.527', 'learning_rate': '4.999e-05', 'epoch': '0.1535', 'num_input_tokens_seen': 12482606, 'train_runtime': '6314', 'train_tokens_per_second': '1977'} +{'loss': '0.9045', 'grad_norm': '1.381', 'learning_rate': '4.999e-05', 'epoch': '0.1536', 'num_input_tokens_seen': 12484653, 'train_runtime': '6315', 'train_tokens_per_second': '1977'} +{'loss': '0.3231', 'grad_norm': '1.169', 'learning_rate': '4.999e-05', 'epoch': '0.1536', 'num_input_tokens_seen': 12486700, 'train_runtime': '6316', 'train_tokens_per_second': '1977'} +{'loss': '1.144', 'grad_norm': '1.569', 'learning_rate': '4.999e-05', 'epoch': '0.1536', 'num_input_tokens_seen': 12488747, 'train_runtime': '6317', 'train_tokens_per_second': '1977'} +{'loss': '1.29', 'grad_norm': '2.036', 'learning_rate': '4.999e-05', 'epoch': '0.1536', 'num_input_tokens_seen': 12490794, 'train_runtime': '6318', 'train_tokens_per_second': '1977'} +{'loss': '0.3845', 'grad_norm': '0.8312', 'learning_rate': '4.999e-05', 'epoch': '0.1537', 'num_input_tokens_seen': 12492841, 'train_runtime': '6319', 'train_tokens_per_second': '1977'} +{'loss': '0.9922', 'grad_norm': '1.765', 'learning_rate': '4.999e-05', 'epoch': '0.1537', 'num_input_tokens_seen': 12494888, 'train_runtime': '6321', 'train_tokens_per_second': '1977'} +{'loss': '0.646', 'grad_norm': '1.335', 'learning_rate': '4.999e-05', 'epoch': '0.1537', 'num_input_tokens_seen': 12496935, 'train_runtime': '6322', 'train_tokens_per_second': '1977'} +{'loss': '0.7603', 'grad_norm': '1.369', 'learning_rate': '4.999e-05', 'epoch': '0.1537', 'num_input_tokens_seen': 12498982, 'train_runtime': '6323', 'train_tokens_per_second': '1977'} +{'loss': '0.858', 'grad_norm': '1.543', 'learning_rate': '4.999e-05', 'epoch': '0.1538', 'num_input_tokens_seen': 12501029, 'train_runtime': '6324', 'train_tokens_per_second': '1977'} +{'loss': '0.512', 'grad_norm': '1.457', 'learning_rate': '4.999e-05', 'epoch': '0.1538', 'num_input_tokens_seen': 12503076, 'train_runtime': '6325', 'train_tokens_per_second': '1977'} +{'loss': '0.6763', 'grad_norm': '1.438', 'learning_rate': '4.999e-05', 'epoch': '0.1538', 'num_input_tokens_seen': 12505123, 'train_runtime': '6326', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '1.475', 'learning_rate': '4.999e-05', 'epoch': '0.1538', 'num_input_tokens_seen': 12507170, 'train_runtime': '6327', 'train_tokens_per_second': '1977'} +{'loss': '0.4549', 'grad_norm': '1.128', 'learning_rate': '4.999e-05', 'epoch': '0.1539', 'num_input_tokens_seen': 12509217, 'train_runtime': '6328', 'train_tokens_per_second': '1977'} +{'loss': '0.8096', 'grad_norm': '2.23', 'learning_rate': '4.999e-05', 'epoch': '0.1539', 'num_input_tokens_seen': 12511264, 'train_runtime': '6329', 'train_tokens_per_second': '1977'} +{'loss': '0.8969', 'grad_norm': '2.134', 'learning_rate': '4.999e-05', 'epoch': '0.1539', 'num_input_tokens_seen': 12513311, 'train_runtime': '6330', 'train_tokens_per_second': '1977'} +{'loss': '0.862', 'grad_norm': '1.596', 'learning_rate': '4.999e-05', 'epoch': '0.1539', 'num_input_tokens_seen': 12515358, 'train_runtime': '6331', 'train_tokens_per_second': '1977'} +{'loss': '1.707', 'grad_norm': '1.989', 'learning_rate': '4.999e-05', 'epoch': '0.154', 'num_input_tokens_seen': 12517405, 'train_runtime': '6332', 'train_tokens_per_second': '1977'} +{'loss': '0.6023', 'grad_norm': '1.318', 'learning_rate': '4.999e-05', 'epoch': '0.154', 'num_input_tokens_seen': 12519452, 'train_runtime': '6333', 'train_tokens_per_second': '1977'} +{'loss': '0.5444', 'grad_norm': '1.286', 'learning_rate': '4.999e-05', 'epoch': '0.154', 'num_input_tokens_seen': 12521499, 'train_runtime': '6334', 'train_tokens_per_second': '1977'} +{'loss': '0.7424', 'grad_norm': '1.508', 'learning_rate': '4.999e-05', 'epoch': '0.154', 'num_input_tokens_seen': 12523546, 'train_runtime': '6335', 'train_tokens_per_second': '1977'} +{'loss': '0.57', 'grad_norm': '1.4', 'learning_rate': '4.999e-05', 'epoch': '0.1541', 'num_input_tokens_seen': 12525593, 'train_runtime': '6336', 'train_tokens_per_second': '1977'} +{'loss': '0.9597', 'grad_norm': '1.3', 'learning_rate': '4.998e-05', 'epoch': '0.1541', 'num_input_tokens_seen': 12527640, 'train_runtime': '6337', 'train_tokens_per_second': '1977'} +{'loss': '0.8614', 'grad_norm': '1.485', 'learning_rate': '4.998e-05', 'epoch': '0.1541', 'num_input_tokens_seen': 12529687, 'train_runtime': '6338', 'train_tokens_per_second': '1977'} +{'loss': '0.3676', 'grad_norm': '0.9851', 'learning_rate': '4.998e-05', 'epoch': '0.1541', 'num_input_tokens_seen': 12531734, 'train_runtime': '6339', 'train_tokens_per_second': '1977'} +{'loss': '0.4798', 'grad_norm': '1.413', 'learning_rate': '4.998e-05', 'epoch': '0.1542', 'num_input_tokens_seen': 12533781, 'train_runtime': '6340', 'train_tokens_per_second': '1977'} +{'loss': '0.379', 'grad_norm': '1.068', 'learning_rate': '4.998e-05', 'epoch': '0.1542', 'num_input_tokens_seen': 12535828, 'train_runtime': '6341', 'train_tokens_per_second': '1977'} +{'loss': '0.3544', 'grad_norm': '1.067', 'learning_rate': '4.998e-05', 'epoch': '0.1542', 'num_input_tokens_seen': 12537875, 'train_runtime': '6342', 'train_tokens_per_second': '1977'} +{'loss': '0.6274', 'grad_norm': '1.063', 'learning_rate': '4.998e-05', 'epoch': '0.1542', 'num_input_tokens_seen': 12539922, 'train_runtime': '6343', 'train_tokens_per_second': '1977'} +{'loss': '0.7721', 'grad_norm': '1.692', 'learning_rate': '4.998e-05', 'epoch': '0.1543', 'num_input_tokens_seen': 12541969, 'train_runtime': '6344', 'train_tokens_per_second': '1977'} +{'loss': '0.3005', 'grad_norm': '0.7962', 'learning_rate': '4.998e-05', 'epoch': '0.1543', 'num_input_tokens_seen': 12544016, 'train_runtime': '6345', 'train_tokens_per_second': '1977'} +{'loss': '0.8239', 'grad_norm': '1.483', 'learning_rate': '4.998e-05', 'epoch': '0.1543', 'num_input_tokens_seen': 12546063, 'train_runtime': '6346', 'train_tokens_per_second': '1977'} +{'loss': '1.504', 'grad_norm': '2.498', 'learning_rate': '4.998e-05', 'epoch': '0.1543', 'num_input_tokens_seen': 12548110, 'train_runtime': '6347', 'train_tokens_per_second': '1977'} +{'loss': '0.6152', 'grad_norm': '1.549', 'learning_rate': '4.998e-05', 'epoch': '0.1544', 'num_input_tokens_seen': 12550157, 'train_runtime': '6348', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '4.168', 'learning_rate': '4.998e-05', 'epoch': '0.1544', 'num_input_tokens_seen': 12552204, 'train_runtime': '6349', 'train_tokens_per_second': '1977'} +{'loss': '0.4468', 'grad_norm': '1.118', 'learning_rate': '4.998e-05', 'epoch': '0.1544', 'num_input_tokens_seen': 12554251, 'train_runtime': '6351', 'train_tokens_per_second': '1977'} +{'loss': '0.3933', 'grad_norm': '1.152', 'learning_rate': '4.998e-05', 'epoch': '0.1544', 'num_input_tokens_seen': 12556298, 'train_runtime': '6352', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '2.105', 'learning_rate': '4.998e-05', 'epoch': '0.1545', 'num_input_tokens_seen': 12558345, 'train_runtime': '6353', 'train_tokens_per_second': '1977'} +{'loss': '0.7267', 'grad_norm': '1.953', 'learning_rate': '4.998e-05', 'epoch': '0.1545', 'num_input_tokens_seen': 12560392, 'train_runtime': '6354', 'train_tokens_per_second': '1977'} +{'loss': '0.32', 'grad_norm': '1.035', 'learning_rate': '4.998e-05', 'epoch': '0.1545', 'num_input_tokens_seen': 12562439, 'train_runtime': '6355', 'train_tokens_per_second': '1977'} +{'loss': '0.8991', 'grad_norm': '1.389', 'learning_rate': '4.998e-05', 'epoch': '0.1545', 'num_input_tokens_seen': 12564486, 'train_runtime': '6356', 'train_tokens_per_second': '1977'} +{'loss': '0.6138', 'grad_norm': '1.389', 'learning_rate': '4.998e-05', 'epoch': '0.1546', 'num_input_tokens_seen': 12566533, 'train_runtime': '6357', 'train_tokens_per_second': '1977'} +{'loss': '0.6236', 'grad_norm': '1.234', 'learning_rate': '4.998e-05', 'epoch': '0.1546', 'num_input_tokens_seen': 12568580, 'train_runtime': '6358', 'train_tokens_per_second': '1977'} +{'loss': '0.9041', 'grad_norm': '1.719', 'learning_rate': '4.998e-05', 'epoch': '0.1546', 'num_input_tokens_seen': 12570627, 'train_runtime': '6359', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '1.329', 'learning_rate': '4.998e-05', 'epoch': '0.1546', 'num_input_tokens_seen': 12572674, 'train_runtime': '6360', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '1.628', 'learning_rate': '4.998e-05', 'epoch': '0.1547', 'num_input_tokens_seen': 12574721, 'train_runtime': '6361', 'train_tokens_per_second': '1977'} +{'loss': '1.379', 'grad_norm': '1.785', 'learning_rate': '4.998e-05', 'epoch': '0.1547', 'num_input_tokens_seen': 12576768, 'train_runtime': '6362', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.42', 'learning_rate': '4.998e-05', 'epoch': '0.1547', 'num_input_tokens_seen': 12578815, 'train_runtime': '6363', 'train_tokens_per_second': '1977'} +{'loss': '0.3755', 'grad_norm': '1.068', 'learning_rate': '4.998e-05', 'epoch': '0.1547', 'num_input_tokens_seen': 12580862, 'train_runtime': '6364', 'train_tokens_per_second': '1977'} +{'loss': '1.409', 'grad_norm': '1.816', 'learning_rate': '4.998e-05', 'epoch': '0.1548', 'num_input_tokens_seen': 12582909, 'train_runtime': '6365', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.631', 'learning_rate': '4.998e-05', 'epoch': '0.1548', 'num_input_tokens_seen': 12584956, 'train_runtime': '6366', 'train_tokens_per_second': '1977'} +{'loss': '0.2931', 'grad_norm': '0.9328', 'learning_rate': '4.998e-05', 'epoch': '0.1548', 'num_input_tokens_seen': 12587003, 'train_runtime': '6367', 'train_tokens_per_second': '1977'} +{'loss': '0.8694', 'grad_norm': '1.419', 'learning_rate': '4.998e-05', 'epoch': '0.1548', 'num_input_tokens_seen': 12589050, 'train_runtime': '6368', 'train_tokens_per_second': '1977'} +{'loss': '1.692', 'grad_norm': '2.117', 'learning_rate': '4.998e-05', 'epoch': '0.1549', 'num_input_tokens_seen': 12591097, 'train_runtime': '6369', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.799', 'learning_rate': '4.998e-05', 'epoch': '0.1549', 'num_input_tokens_seen': 12593144, 'train_runtime': '6370', 'train_tokens_per_second': '1977'} +{'loss': '0.6099', 'grad_norm': '1.474', 'learning_rate': '4.998e-05', 'epoch': '0.1549', 'num_input_tokens_seen': 12595191, 'train_runtime': '6371', 'train_tokens_per_second': '1977'} +{'loss': '1.37', 'grad_norm': '2.044', 'learning_rate': '4.998e-05', 'epoch': '0.1549', 'num_input_tokens_seen': 12597238, 'train_runtime': '6372', 'train_tokens_per_second': '1977'} +{'loss': '0.9123', 'grad_norm': '2.017', 'learning_rate': '4.998e-05', 'epoch': '0.155', 'num_input_tokens_seen': 12599285, 'train_runtime': '6373', 'train_tokens_per_second': '1977'} +{'loss': '0.9463', 'grad_norm': '1.412', 'learning_rate': '4.998e-05', 'epoch': '0.155', 'num_input_tokens_seen': 12601332, 'train_runtime': '6374', 'train_tokens_per_second': '1977'} +{'loss': '1.693', 'grad_norm': '2.907', 'learning_rate': '4.998e-05', 'epoch': '0.155', 'num_input_tokens_seen': 12603379, 'train_runtime': '6375', 'train_tokens_per_second': '1977'} +{'loss': '1.016', 'grad_norm': '1.631', 'learning_rate': '4.998e-05', 'epoch': '0.155', 'num_input_tokens_seen': 12605426, 'train_runtime': '6376', 'train_tokens_per_second': '1977'} +{'loss': '0.9231', 'grad_norm': '1.71', 'learning_rate': '4.998e-05', 'epoch': '0.1551', 'num_input_tokens_seen': 12607473, 'train_runtime': '6377', 'train_tokens_per_second': '1977'} +{'loss': '1.021', 'grad_norm': '1.512', 'learning_rate': '4.998e-05', 'epoch': '0.1551', 'num_input_tokens_seen': 12609520, 'train_runtime': '6378', 'train_tokens_per_second': '1977'} +{'loss': '1.159', 'grad_norm': '1.945', 'learning_rate': '4.998e-05', 'epoch': '0.1551', 'num_input_tokens_seen': 12611567, 'train_runtime': '6379', 'train_tokens_per_second': '1977'} +{'loss': '2.233', 'grad_norm': '2.525', 'learning_rate': '4.998e-05', 'epoch': '0.1551', 'num_input_tokens_seen': 12613614, 'train_runtime': '6381', 'train_tokens_per_second': '1977'} +{'loss': '0.6617', 'grad_norm': '0.9537', 'learning_rate': '4.998e-05', 'epoch': '0.1552', 'num_input_tokens_seen': 12615661, 'train_runtime': '6382', 'train_tokens_per_second': '1977'} +{'loss': '1.798', 'grad_norm': '2.623', 'learning_rate': '4.998e-05', 'epoch': '0.1552', 'num_input_tokens_seen': 12617708, 'train_runtime': '6383', 'train_tokens_per_second': '1977'} +{'loss': '2.379', 'grad_norm': '3.044', 'learning_rate': '4.998e-05', 'epoch': '0.1552', 'num_input_tokens_seen': 12619755, 'train_runtime': '6384', 'train_tokens_per_second': '1977'} +{'loss': '0.3705', 'grad_norm': '1.171', 'learning_rate': '4.998e-05', 'epoch': '0.1552', 'num_input_tokens_seen': 12621802, 'train_runtime': '6385', 'train_tokens_per_second': '1977'} +{'loss': '0.6998', 'grad_norm': '1.642', 'learning_rate': '4.998e-05', 'epoch': '0.1553', 'num_input_tokens_seen': 12623849, 'train_runtime': '6386', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '1.585', 'learning_rate': '4.998e-05', 'epoch': '0.1553', 'num_input_tokens_seen': 12625896, 'train_runtime': '6387', 'train_tokens_per_second': '1977'} +{'loss': '0.968', 'grad_norm': '1.267', 'learning_rate': '4.998e-05', 'epoch': '0.1553', 'num_input_tokens_seen': 12627943, 'train_runtime': '6388', 'train_tokens_per_second': '1977'} +{'loss': '0.6736', 'grad_norm': '0.9702', 'learning_rate': '4.998e-05', 'epoch': '0.1553', 'num_input_tokens_seen': 12629990, 'train_runtime': '6389', 'train_tokens_per_second': '1977'} +{'loss': '1.869', 'grad_norm': '2.342', 'learning_rate': '4.998e-05', 'epoch': '0.1554', 'num_input_tokens_seen': 12632037, 'train_runtime': '6390', 'train_tokens_per_second': '1977'} +{'loss': '0.4108', 'grad_norm': '1.093', 'learning_rate': '4.998e-05', 'epoch': '0.1554', 'num_input_tokens_seen': 12634084, 'train_runtime': '6391', 'train_tokens_per_second': '1977'} +{'loss': '0.3024', 'grad_norm': '0.8963', 'learning_rate': '4.998e-05', 'epoch': '0.1554', 'num_input_tokens_seen': 12636131, 'train_runtime': '6392', 'train_tokens_per_second': '1977'} +{'loss': '0.4489', 'grad_norm': '1.262', 'learning_rate': '4.998e-05', 'epoch': '0.1554', 'num_input_tokens_seen': 12638178, 'train_runtime': '6393', 'train_tokens_per_second': '1977'} +{'loss': '0.9743', 'grad_norm': '1.675', 'learning_rate': '4.998e-05', 'epoch': '0.1555', 'num_input_tokens_seen': 12640225, 'train_runtime': '6394', 'train_tokens_per_second': '1977'} +{'loss': '0.8463', 'grad_norm': '1.46', 'learning_rate': '4.998e-05', 'epoch': '0.1555', 'num_input_tokens_seen': 12642272, 'train_runtime': '6395', 'train_tokens_per_second': '1977'} +{'loss': '0.6433', 'grad_norm': '1.284', 'learning_rate': '4.998e-05', 'epoch': '0.1555', 'num_input_tokens_seen': 12644319, 'train_runtime': '6396', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.523', 'learning_rate': '4.998e-05', 'epoch': '0.1556', 'num_input_tokens_seen': 12646366, 'train_runtime': '6397', 'train_tokens_per_second': '1977'} +{'loss': '0.2826', 'grad_norm': '0.9578', 'learning_rate': '4.998e-05', 'epoch': '0.1556', 'num_input_tokens_seen': 12648413, 'train_runtime': '6398', 'train_tokens_per_second': '1977'} +{'loss': '0.8663', 'grad_norm': '1.74', 'learning_rate': '4.998e-05', 'epoch': '0.1556', 'num_input_tokens_seen': 12650460, 'train_runtime': '6399', 'train_tokens_per_second': '1977'} +{'loss': '0.6543', 'grad_norm': '1.327', 'learning_rate': '4.998e-05', 'epoch': '0.1556', 'num_input_tokens_seen': 12652507, 'train_runtime': '6400', 'train_tokens_per_second': '1977'} +{'loss': '0.7658', 'grad_norm': '1.415', 'learning_rate': '4.998e-05', 'epoch': '0.1557', 'num_input_tokens_seen': 12654554, 'train_runtime': '6401', 'train_tokens_per_second': '1977'} +{'loss': '0.4436', 'grad_norm': '1.225', 'learning_rate': '4.998e-05', 'epoch': '0.1557', 'num_input_tokens_seen': 12656601, 'train_runtime': '6402', 'train_tokens_per_second': '1977'} +{'loss': '0.8724', 'grad_norm': '1.457', 'learning_rate': '4.998e-05', 'epoch': '0.1557', 'num_input_tokens_seen': 12658648, 'train_runtime': '6403', 'train_tokens_per_second': '1977'} +{'loss': '0.6533', 'grad_norm': '1.262', 'learning_rate': '4.998e-05', 'epoch': '0.1557', 'num_input_tokens_seen': 12660695, 'train_runtime': '6404', 'train_tokens_per_second': '1977'} +{'loss': '0.9616', 'grad_norm': '1.384', 'learning_rate': '4.998e-05', 'epoch': '0.1558', 'num_input_tokens_seen': 12662742, 'train_runtime': '6405', 'train_tokens_per_second': '1977'} +{'loss': '1.267', 'grad_norm': '1.683', 'learning_rate': '4.998e-05', 'epoch': '0.1558', 'num_input_tokens_seen': 12664789, 'train_runtime': '6406', 'train_tokens_per_second': '1977'} +{'loss': '0.6645', 'grad_norm': '1.332', 'learning_rate': '4.998e-05', 'epoch': '0.1558', 'num_input_tokens_seen': 12666836, 'train_runtime': '6407', 'train_tokens_per_second': '1977'} +{'loss': '0.3147', 'grad_norm': '1.101', 'learning_rate': '4.998e-05', 'epoch': '0.1558', 'num_input_tokens_seen': 12668883, 'train_runtime': '6408', 'train_tokens_per_second': '1977'} +{'loss': '1.248', 'grad_norm': '2.261', 'learning_rate': '4.998e-05', 'epoch': '0.1559', 'num_input_tokens_seen': 12670930, 'train_runtime': '6409', 'train_tokens_per_second': '1977'} +{'loss': '0.4649', 'grad_norm': '1.02', 'learning_rate': '4.998e-05', 'epoch': '0.1559', 'num_input_tokens_seen': 12672977, 'train_runtime': '6411', 'train_tokens_per_second': '1977'} +{'loss': '0.2017', 'grad_norm': '0.8619', 'learning_rate': '4.998e-05', 'epoch': '0.1559', 'num_input_tokens_seen': 12675024, 'train_runtime': '6412', 'train_tokens_per_second': '1977'} +{'loss': '0.8271', 'grad_norm': '1.593', 'learning_rate': '4.998e-05', 'epoch': '0.1559', 'num_input_tokens_seen': 12677071, 'train_runtime': '6413', 'train_tokens_per_second': '1977'} +{'loss': '0.3923', 'grad_norm': '0.9536', 'learning_rate': '4.998e-05', 'epoch': '0.156', 'num_input_tokens_seen': 12679118, 'train_runtime': '6414', 'train_tokens_per_second': '1977'} +{'loss': '0.5838', 'grad_norm': '1.182', 'learning_rate': '4.998e-05', 'epoch': '0.156', 'num_input_tokens_seen': 12681165, 'train_runtime': '6415', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '1.629', 'learning_rate': '4.998e-05', 'epoch': '0.156', 'num_input_tokens_seen': 12683212, 'train_runtime': '6416', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '1.881', 'learning_rate': '4.998e-05', 'epoch': '0.156', 'num_input_tokens_seen': 12685259, 'train_runtime': '6417', 'train_tokens_per_second': '1977'} +{'loss': '0.9676', 'grad_norm': '1.6', 'learning_rate': '4.998e-05', 'epoch': '0.1561', 'num_input_tokens_seen': 12687306, 'train_runtime': '6418', 'train_tokens_per_second': '1977'} +{'loss': '0.9254', 'grad_norm': '1.565', 'learning_rate': '4.998e-05', 'epoch': '0.1561', 'num_input_tokens_seen': 12689353, 'train_runtime': '6419', 'train_tokens_per_second': '1977'} +{'loss': '1.38', 'grad_norm': '2.06', 'learning_rate': '4.998e-05', 'epoch': '0.1561', 'num_input_tokens_seen': 12691400, 'train_runtime': '6420', 'train_tokens_per_second': '1977'} +{'loss': '0.9428', 'grad_norm': '1.493', 'learning_rate': '4.998e-05', 'epoch': '0.1561', 'num_input_tokens_seen': 12693447, 'train_runtime': '6421', 'train_tokens_per_second': '1977'} +{'loss': '0.5432', 'grad_norm': '1.244', 'learning_rate': '4.998e-05', 'epoch': '0.1562', 'num_input_tokens_seen': 12695494, 'train_runtime': '6422', 'train_tokens_per_second': '1977'} +{'loss': '1.152', 'grad_norm': '1.784', 'learning_rate': '4.998e-05', 'epoch': '0.1562', 'num_input_tokens_seen': 12697541, 'train_runtime': '6423', 'train_tokens_per_second': '1977'} +{'loss': '0.8233', 'grad_norm': '1.738', 'learning_rate': '4.998e-05', 'epoch': '0.1562', 'num_input_tokens_seen': 12699588, 'train_runtime': '6424', 'train_tokens_per_second': '1977'} +{'loss': '1.445', 'grad_norm': '1.706', 'learning_rate': '4.998e-05', 'epoch': '0.1562', 'num_input_tokens_seen': 12701635, 'train_runtime': '6425', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.246', 'learning_rate': '4.998e-05', 'epoch': '0.1563', 'num_input_tokens_seen': 12703682, 'train_runtime': '6426', 'train_tokens_per_second': '1977'} +{'loss': '1.291', 'grad_norm': '1.872', 'learning_rate': '4.998e-05', 'epoch': '0.1563', 'num_input_tokens_seen': 12705729, 'train_runtime': '6427', 'train_tokens_per_second': '1977'} +{'loss': '0.4364', 'grad_norm': '1.212', 'learning_rate': '4.998e-05', 'epoch': '0.1563', 'num_input_tokens_seen': 12707776, 'train_runtime': '6428', 'train_tokens_per_second': '1977'} +{'loss': '1.282', 'grad_norm': '1.846', 'learning_rate': '4.998e-05', 'epoch': '0.1563', 'num_input_tokens_seen': 12709823, 'train_runtime': '6429', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '1.546', 'learning_rate': '4.998e-05', 'epoch': '0.1564', 'num_input_tokens_seen': 12711870, 'train_runtime': '6430', 'train_tokens_per_second': '1977'} +{'loss': '0.828', 'grad_norm': '1.425', 'learning_rate': '4.998e-05', 'epoch': '0.1564', 'num_input_tokens_seen': 12713917, 'train_runtime': '6431', 'train_tokens_per_second': '1977'} +{'loss': '1.671', 'grad_norm': '2.217', 'learning_rate': '4.998e-05', 'epoch': '0.1564', 'num_input_tokens_seen': 12715964, 'train_runtime': '6432', 'train_tokens_per_second': '1977'} +{'loss': '1.336', 'grad_norm': '2.209', 'learning_rate': '4.998e-05', 'epoch': '0.1564', 'num_input_tokens_seen': 12718011, 'train_runtime': '6433', 'train_tokens_per_second': '1977'} +{'loss': '0.8108', 'grad_norm': '1.537', 'learning_rate': '4.998e-05', 'epoch': '0.1565', 'num_input_tokens_seen': 12720058, 'train_runtime': '6434', 'train_tokens_per_second': '1977'} +{'loss': '1.564', 'grad_norm': '1.833', 'learning_rate': '4.998e-05', 'epoch': '0.1565', 'num_input_tokens_seen': 12722105, 'train_runtime': '6435', 'train_tokens_per_second': '1977'} +{'loss': '0.5888', 'grad_norm': '1.361', 'learning_rate': '4.998e-05', 'epoch': '0.1565', 'num_input_tokens_seen': 12724152, 'train_runtime': '6436', 'train_tokens_per_second': '1977'} +{'loss': '0.6217', 'grad_norm': '1.52', 'learning_rate': '4.998e-05', 'epoch': '0.1565', 'num_input_tokens_seen': 12726199, 'train_runtime': '6437', 'train_tokens_per_second': '1977'} +{'loss': '0.4549', 'grad_norm': '1.106', 'learning_rate': '4.998e-05', 'epoch': '0.1566', 'num_input_tokens_seen': 12728246, 'train_runtime': '6438', 'train_tokens_per_second': '1977'} +{'loss': '0.6691', 'grad_norm': '1.659', 'learning_rate': '4.998e-05', 'epoch': '0.1566', 'num_input_tokens_seen': 12730293, 'train_runtime': '6439', 'train_tokens_per_second': '1977'} +{'loss': '0.7508', 'grad_norm': '1.304', 'learning_rate': '4.998e-05', 'epoch': '0.1566', 'num_input_tokens_seen': 12732340, 'train_runtime': '6441', 'train_tokens_per_second': '1977'} +{'loss': '0.6468', 'grad_norm': '1.534', 'learning_rate': '4.998e-05', 'epoch': '0.1566', 'num_input_tokens_seen': 12734387, 'train_runtime': '6442', 'train_tokens_per_second': '1977'} +{'loss': '0.8361', 'grad_norm': '1.535', 'learning_rate': '4.998e-05', 'epoch': '0.1567', 'num_input_tokens_seen': 12736434, 'train_runtime': '6443', 'train_tokens_per_second': '1977'} +{'loss': '0.616', 'grad_norm': '1.328', 'learning_rate': '4.998e-05', 'epoch': '0.1567', 'num_input_tokens_seen': 12738481, 'train_runtime': '6444', 'train_tokens_per_second': '1977'} +{'loss': '1.352', 'grad_norm': '1.88', 'learning_rate': '4.998e-05', 'epoch': '0.1567', 'num_input_tokens_seen': 12740528, 'train_runtime': '6445', 'train_tokens_per_second': '1977'} +{'loss': '0.4813', 'grad_norm': '1.074', 'learning_rate': '4.998e-05', 'epoch': '0.1567', 'num_input_tokens_seen': 12742575, 'train_runtime': '6446', 'train_tokens_per_second': '1977'} +{'loss': '0.5544', 'grad_norm': '1.029', 'learning_rate': '4.998e-05', 'epoch': '0.1568', 'num_input_tokens_seen': 12744622, 'train_runtime': '6447', 'train_tokens_per_second': '1977'} +{'loss': '0.8627', 'grad_norm': '1.355', 'learning_rate': '4.998e-05', 'epoch': '0.1568', 'num_input_tokens_seen': 12746669, 'train_runtime': '6448', 'train_tokens_per_second': '1977'} +{'loss': '0.7087', 'grad_norm': '1.341', 'learning_rate': '4.998e-05', 'epoch': '0.1568', 'num_input_tokens_seen': 12748716, 'train_runtime': '6449', 'train_tokens_per_second': '1977'} +{'loss': '0.7831', 'grad_norm': '1.355', 'learning_rate': '4.998e-05', 'epoch': '0.1568', 'num_input_tokens_seen': 12750763, 'train_runtime': '6450', 'train_tokens_per_second': '1977'} +{'loss': '0.5905', 'grad_norm': '1.103', 'learning_rate': '4.998e-05', 'epoch': '0.1569', 'num_input_tokens_seen': 12752810, 'train_runtime': '6451', 'train_tokens_per_second': '1977'} +{'loss': '0.3141', 'grad_norm': '1.133', 'learning_rate': '4.998e-05', 'epoch': '0.1569', 'num_input_tokens_seen': 12754857, 'train_runtime': '6452', 'train_tokens_per_second': '1977'} +{'loss': '0.7317', 'grad_norm': '1.23', 'learning_rate': '4.998e-05', 'epoch': '0.1569', 'num_input_tokens_seen': 12756904, 'train_runtime': '6453', 'train_tokens_per_second': '1977'} +{'loss': '0.4155', 'grad_norm': '1.04', 'learning_rate': '4.998e-05', 'epoch': '0.1569', 'num_input_tokens_seen': 12758951, 'train_runtime': '6454', 'train_tokens_per_second': '1977'} +{'loss': '0.3498', 'grad_norm': '1.153', 'learning_rate': '4.998e-05', 'epoch': '0.157', 'num_input_tokens_seen': 12760998, 'train_runtime': '6455', 'train_tokens_per_second': '1977'} +{'loss': '0.352', 'grad_norm': '1.351', 'learning_rate': '4.998e-05', 'epoch': '0.157', 'num_input_tokens_seen': 12763045, 'train_runtime': '6456', 'train_tokens_per_second': '1977'} +{'loss': '0.997', 'grad_norm': '1.61', 'learning_rate': '4.998e-05', 'epoch': '0.157', 'num_input_tokens_seen': 12765092, 'train_runtime': '6457', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '1.993', 'learning_rate': '4.998e-05', 'epoch': '0.157', 'num_input_tokens_seen': 12767139, 'train_runtime': '6458', 'train_tokens_per_second': '1977'} +{'loss': '1.18', 'grad_norm': '1.614', 'learning_rate': '4.998e-05', 'epoch': '0.1571', 'num_input_tokens_seen': 12769186, 'train_runtime': '6459', 'train_tokens_per_second': '1977'} +{'loss': '0.721', 'grad_norm': '1.425', 'learning_rate': '4.998e-05', 'epoch': '0.1571', 'num_input_tokens_seen': 12771233, 'train_runtime': '6460', 'train_tokens_per_second': '1977'} +{'loss': '0.7178', 'grad_norm': '1.406', 'learning_rate': '4.998e-05', 'epoch': '0.1571', 'num_input_tokens_seen': 12773280, 'train_runtime': '6461', 'train_tokens_per_second': '1977'} +{'loss': '0.4347', 'grad_norm': '1.259', 'learning_rate': '4.998e-05', 'epoch': '0.1571', 'num_input_tokens_seen': 12775327, 'train_runtime': '6462', 'train_tokens_per_second': '1977'} +{'loss': '0.282', 'grad_norm': '0.8731', 'learning_rate': '4.998e-05', 'epoch': '0.1572', 'num_input_tokens_seen': 12777374, 'train_runtime': '6463', 'train_tokens_per_second': '1977'} +{'loss': '0.7744', 'grad_norm': '1.366', 'learning_rate': '4.998e-05', 'epoch': '0.1572', 'num_input_tokens_seen': 12779421, 'train_runtime': '6464', 'train_tokens_per_second': '1977'} +{'loss': '0.2845', 'grad_norm': '0.95', 'learning_rate': '4.998e-05', 'epoch': '0.1572', 'num_input_tokens_seen': 12781468, 'train_runtime': '6465', 'train_tokens_per_second': '1977'} +{'loss': '0.3785', 'grad_norm': '0.9295', 'learning_rate': '4.998e-05', 'epoch': '0.1572', 'num_input_tokens_seen': 12783515, 'train_runtime': '6466', 'train_tokens_per_second': '1977'} +{'loss': '1.043', 'grad_norm': '1.563', 'learning_rate': '4.998e-05', 'epoch': '0.1573', 'num_input_tokens_seen': 12785562, 'train_runtime': '6467', 'train_tokens_per_second': '1977'} +{'loss': '0.4109', 'grad_norm': '1.021', 'learning_rate': '4.998e-05', 'epoch': '0.1573', 'num_input_tokens_seen': 12787609, 'train_runtime': '6468', 'train_tokens_per_second': '1977'} +{'loss': '0.3066', 'grad_norm': '1.093', 'learning_rate': '4.998e-05', 'epoch': '0.1573', 'num_input_tokens_seen': 12789656, 'train_runtime': '6469', 'train_tokens_per_second': '1977'} +{'loss': '0.4962', 'grad_norm': '1.572', 'learning_rate': '4.998e-05', 'epoch': '0.1573', 'num_input_tokens_seen': 12791703, 'train_runtime': '6470', 'train_tokens_per_second': '1977'} +{'loss': '0.5483', 'grad_norm': '1.112', 'learning_rate': '4.998e-05', 'epoch': '0.1574', 'num_input_tokens_seen': 12793750, 'train_runtime': '6472', 'train_tokens_per_second': '1977'} +{'loss': '0.7116', 'grad_norm': '1.621', 'learning_rate': '4.998e-05', 'epoch': '0.1574', 'num_input_tokens_seen': 12795797, 'train_runtime': '6473', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.137', 'learning_rate': '4.998e-05', 'epoch': '0.1574', 'num_input_tokens_seen': 12797844, 'train_runtime': '6474', 'train_tokens_per_second': '1977'} +{'loss': '0.7667', 'grad_norm': '1.759', 'learning_rate': '4.998e-05', 'epoch': '0.1574', 'num_input_tokens_seen': 12799891, 'train_runtime': '6475', 'train_tokens_per_second': '1977'} +{'loss': '0.5875', 'grad_norm': '1.253', 'learning_rate': '4.998e-05', 'epoch': '0.1575', 'num_input_tokens_seen': 12801938, 'train_runtime': '6476', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '1.838', 'learning_rate': '4.998e-05', 'epoch': '0.1575', 'num_input_tokens_seen': 12803985, 'train_runtime': '6477', 'train_tokens_per_second': '1977'} +{'loss': '0.5987', 'grad_norm': '1.11', 'learning_rate': '4.998e-05', 'epoch': '0.1575', 'num_input_tokens_seen': 12806032, 'train_runtime': '6478', 'train_tokens_per_second': '1977'} +{'loss': '0.4489', 'grad_norm': '1.465', 'learning_rate': '4.998e-05', 'epoch': '0.1575', 'num_input_tokens_seen': 12808079, 'train_runtime': '6479', 'train_tokens_per_second': '1977'} +{'loss': '0.5989', 'grad_norm': '1.15', 'learning_rate': '4.998e-05', 'epoch': '0.1576', 'num_input_tokens_seen': 12810126, 'train_runtime': '6480', 'train_tokens_per_second': '1977'} +{'loss': '1.572', 'grad_norm': '3.832', 'learning_rate': '4.998e-05', 'epoch': '0.1576', 'num_input_tokens_seen': 12812173, 'train_runtime': '6481', 'train_tokens_per_second': '1977'} +{'loss': '2.3', 'grad_norm': '2.427', 'learning_rate': '4.998e-05', 'epoch': '0.1576', 'num_input_tokens_seen': 12814220, 'train_runtime': '6482', 'train_tokens_per_second': '1977'} +{'loss': '0.3294', 'grad_norm': '0.9189', 'learning_rate': '4.998e-05', 'epoch': '0.1576', 'num_input_tokens_seen': 12816267, 'train_runtime': '6483', 'train_tokens_per_second': '1977'} +{'loss': '1.565', 'grad_norm': '1.875', 'learning_rate': '4.998e-05', 'epoch': '0.1577', 'num_input_tokens_seen': 12818314, 'train_runtime': '6484', 'train_tokens_per_second': '1977'} +{'loss': '0.7282', 'grad_norm': '1.246', 'learning_rate': '4.998e-05', 'epoch': '0.1577', 'num_input_tokens_seen': 12820361, 'train_runtime': '6485', 'train_tokens_per_second': '1977'} +{'loss': '1.255', 'grad_norm': '1.868', 'learning_rate': '4.998e-05', 'epoch': '0.1577', 'num_input_tokens_seen': 12822408, 'train_runtime': '6486', 'train_tokens_per_second': '1977'} +{'loss': '1.404', 'grad_norm': '2.01', 'learning_rate': '4.998e-05', 'epoch': '0.1577', 'num_input_tokens_seen': 12824455, 'train_runtime': '6487', 'train_tokens_per_second': '1977'} +{'loss': '0.5834', 'grad_norm': '0.9479', 'learning_rate': '4.998e-05', 'epoch': '0.1578', 'num_input_tokens_seen': 12826502, 'train_runtime': '6488', 'train_tokens_per_second': '1977'} +{'loss': '0.4974', 'grad_norm': '0.9793', 'learning_rate': '4.998e-05', 'epoch': '0.1578', 'num_input_tokens_seen': 12828549, 'train_runtime': '6489', 'train_tokens_per_second': '1977'} +{'loss': '0.5361', 'grad_norm': '1.021', 'learning_rate': '4.998e-05', 'epoch': '0.1578', 'num_input_tokens_seen': 12830596, 'train_runtime': '6490', 'train_tokens_per_second': '1977'} +{'loss': '1.434', 'grad_norm': '2.25', 'learning_rate': '4.998e-05', 'epoch': '0.1578', 'num_input_tokens_seen': 12832643, 'train_runtime': '6491', 'train_tokens_per_second': '1977'} +{'loss': '0.4012', 'grad_norm': '1.094', 'learning_rate': '4.998e-05', 'epoch': '0.1579', 'num_input_tokens_seen': 12834690, 'train_runtime': '6492', 'train_tokens_per_second': '1977'} +{'loss': '1.831', 'grad_norm': '2.115', 'learning_rate': '4.998e-05', 'epoch': '0.1579', 'num_input_tokens_seen': 12836737, 'train_runtime': '6493', 'train_tokens_per_second': '1977'} +{'loss': '2.19', 'grad_norm': '2.389', 'learning_rate': '4.998e-05', 'epoch': '0.1579', 'num_input_tokens_seen': 12838784, 'train_runtime': '6494', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '1.572', 'learning_rate': '4.998e-05', 'epoch': '0.1579', 'num_input_tokens_seen': 12840831, 'train_runtime': '6495', 'train_tokens_per_second': '1977'} +{'loss': '0.2224', 'grad_norm': '0.8327', 'learning_rate': '4.998e-05', 'epoch': '0.158', 'num_input_tokens_seen': 12842878, 'train_runtime': '6496', 'train_tokens_per_second': '1977'} +{'loss': '1.145', 'grad_norm': '1.64', 'learning_rate': '4.998e-05', 'epoch': '0.158', 'num_input_tokens_seen': 12844925, 'train_runtime': '6497', 'train_tokens_per_second': '1977'} +{'loss': '2.259', 'grad_norm': '2.764', 'learning_rate': '4.998e-05', 'epoch': '0.158', 'num_input_tokens_seen': 12846972, 'train_runtime': '6498', 'train_tokens_per_second': '1977'} +{'loss': '0.7081', 'grad_norm': '1.523', 'learning_rate': '4.998e-05', 'epoch': '0.158', 'num_input_tokens_seen': 12849019, 'train_runtime': '6499', 'train_tokens_per_second': '1977'} +{'loss': '0.7429', 'grad_norm': '1.67', 'learning_rate': '4.998e-05', 'epoch': '0.1581', 'num_input_tokens_seen': 12851066, 'train_runtime': '6500', 'train_tokens_per_second': '1977'} +{'loss': '0.2267', 'grad_norm': '1.019', 'learning_rate': '4.998e-05', 'epoch': '0.1581', 'num_input_tokens_seen': 12853113, 'train_runtime': '6502', 'train_tokens_per_second': '1977'} +{'loss': '0.7638', 'grad_norm': '1.435', 'learning_rate': '4.998e-05', 'epoch': '0.1581', 'num_input_tokens_seen': 12855160, 'train_runtime': '6503', 'train_tokens_per_second': '1977'} +{'loss': '0.7734', 'grad_norm': '1.924', 'learning_rate': '4.998e-05', 'epoch': '0.1581', 'num_input_tokens_seen': 12857207, 'train_runtime': '6504', 'train_tokens_per_second': '1977'} +{'loss': '0.6088', 'grad_norm': '0.964', 'learning_rate': '4.998e-05', 'epoch': '0.1582', 'num_input_tokens_seen': 12859254, 'train_runtime': '6505', 'train_tokens_per_second': '1977'} +{'loss': '0.9235', 'grad_norm': '1.216', 'learning_rate': '4.998e-05', 'epoch': '0.1582', 'num_input_tokens_seen': 12861301, 'train_runtime': '6506', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '1.697', 'learning_rate': '4.998e-05', 'epoch': '0.1582', 'num_input_tokens_seen': 12863348, 'train_runtime': '6507', 'train_tokens_per_second': '1977'} +{'loss': '0.9071', 'grad_norm': '1.535', 'learning_rate': '4.998e-05', 'epoch': '0.1582', 'num_input_tokens_seen': 12865395, 'train_runtime': '6508', 'train_tokens_per_second': '1977'} +{'loss': '2.035', 'grad_norm': '2.485', 'learning_rate': '4.998e-05', 'epoch': '0.1583', 'num_input_tokens_seen': 12867442, 'train_runtime': '6509', 'train_tokens_per_second': '1977'} +{'loss': '0.4963', 'grad_norm': '1.03', 'learning_rate': '4.998e-05', 'epoch': '0.1583', 'num_input_tokens_seen': 12869489, 'train_runtime': '6510', 'train_tokens_per_second': '1977'} +{'loss': '0.7864', 'grad_norm': '1.539', 'learning_rate': '4.998e-05', 'epoch': '0.1583', 'num_input_tokens_seen': 12871536, 'train_runtime': '6511', 'train_tokens_per_second': '1977'} +{'loss': '0.8855', 'grad_norm': '1.272', 'learning_rate': '4.998e-05', 'epoch': '0.1583', 'num_input_tokens_seen': 12873583, 'train_runtime': '6512', 'train_tokens_per_second': '1977'} +{'loss': '1.545', 'grad_norm': '2.096', 'learning_rate': '4.998e-05', 'epoch': '0.1584', 'num_input_tokens_seen': 12875630, 'train_runtime': '6513', 'train_tokens_per_second': '1977'} +{'loss': '0.3551', 'grad_norm': '1.11', 'learning_rate': '4.998e-05', 'epoch': '0.1584', 'num_input_tokens_seen': 12877677, 'train_runtime': '6514', 'train_tokens_per_second': '1977'} +{'loss': '1.247', 'grad_norm': '1.773', 'learning_rate': '4.998e-05', 'epoch': '0.1584', 'num_input_tokens_seen': 12879724, 'train_runtime': '6515', 'train_tokens_per_second': '1977'} +{'loss': '0.6715', 'grad_norm': '1.376', 'learning_rate': '4.998e-05', 'epoch': '0.1584', 'num_input_tokens_seen': 12881771, 'train_runtime': '6516', 'train_tokens_per_second': '1977'} +{'loss': '0.9562', 'grad_norm': '1.755', 'learning_rate': '4.998e-05', 'epoch': '0.1585', 'num_input_tokens_seen': 12883818, 'train_runtime': '6517', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.795', 'learning_rate': '4.998e-05', 'epoch': '0.1585', 'num_input_tokens_seen': 12885865, 'train_runtime': '6518', 'train_tokens_per_second': '1977'} +{'loss': '0.3171', 'grad_norm': '0.8975', 'learning_rate': '4.998e-05', 'epoch': '0.1585', 'num_input_tokens_seen': 12887912, 'train_runtime': '6519', 'train_tokens_per_second': '1977'} +{'loss': '1.213', 'grad_norm': '1.483', 'learning_rate': '4.998e-05', 'epoch': '0.1585', 'num_input_tokens_seen': 12889959, 'train_runtime': '6520', 'train_tokens_per_second': '1977'} +{'loss': '0.5926', 'grad_norm': '1.375', 'learning_rate': '4.998e-05', 'epoch': '0.1586', 'num_input_tokens_seen': 12892006, 'train_runtime': '6521', 'train_tokens_per_second': '1977'} +{'loss': '0.816', 'grad_norm': '1.2', 'learning_rate': '4.998e-05', 'epoch': '0.1586', 'num_input_tokens_seen': 12894053, 'train_runtime': '6522', 'train_tokens_per_second': '1977'} +{'loss': '0.8141', 'grad_norm': '1.705', 'learning_rate': '4.998e-05', 'epoch': '0.1586', 'num_input_tokens_seen': 12896100, 'train_runtime': '6523', 'train_tokens_per_second': '1977'} +{'loss': '1.949', 'grad_norm': '2.353', 'learning_rate': '4.998e-05', 'epoch': '0.1586', 'num_input_tokens_seen': 12898147, 'train_runtime': '6524', 'train_tokens_per_second': '1977'} +{'loss': '0.3506', 'grad_norm': '0.842', 'learning_rate': '4.998e-05', 'epoch': '0.1587', 'num_input_tokens_seen': 12900194, 'train_runtime': '6525', 'train_tokens_per_second': '1977'} +{'loss': '0.3696', 'grad_norm': '0.6618', 'learning_rate': '4.998e-05', 'epoch': '0.1587', 'num_input_tokens_seen': 12902241, 'train_runtime': '6526', 'train_tokens_per_second': '1977'} +{'loss': '1.609', 'grad_norm': '2.367', 'learning_rate': '4.998e-05', 'epoch': '0.1587', 'num_input_tokens_seen': 12904288, 'train_runtime': '6527', 'train_tokens_per_second': '1977'} +{'loss': '0.5404', 'grad_norm': '1.445', 'learning_rate': '4.998e-05', 'epoch': '0.1587', 'num_input_tokens_seen': 12906335, 'train_runtime': '6528', 'train_tokens_per_second': '1977'} +{'loss': '0.4569', 'grad_norm': '1.076', 'learning_rate': '4.998e-05', 'epoch': '0.1588', 'num_input_tokens_seen': 12908382, 'train_runtime': '6529', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '1.638', 'learning_rate': '4.998e-05', 'epoch': '0.1588', 'num_input_tokens_seen': 12910429, 'train_runtime': '6530', 'train_tokens_per_second': '1977'} +{'loss': '0.7454', 'grad_norm': '1.176', 'learning_rate': '4.998e-05', 'epoch': '0.1588', 'num_input_tokens_seen': 12912476, 'train_runtime': '6532', 'train_tokens_per_second': '1977'} +{'loss': '0.6846', 'grad_norm': '1.483', 'learning_rate': '4.998e-05', 'epoch': '0.1588', 'num_input_tokens_seen': 12914523, 'train_runtime': '6533', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '2.288', 'learning_rate': '4.998e-05', 'epoch': '0.1589', 'num_input_tokens_seen': 12916570, 'train_runtime': '6534', 'train_tokens_per_second': '1977'} +{'loss': '1.079', 'grad_norm': '1.564', 'learning_rate': '4.998e-05', 'epoch': '0.1589', 'num_input_tokens_seen': 12918617, 'train_runtime': '6535', 'train_tokens_per_second': '1977'} +{'loss': '0.2762', 'grad_norm': '0.9045', 'learning_rate': '4.998e-05', 'epoch': '0.1589', 'num_input_tokens_seen': 12920664, 'train_runtime': '6536', 'train_tokens_per_second': '1977'} +{'loss': '1.508', 'grad_norm': '2.077', 'learning_rate': '4.998e-05', 'epoch': '0.1589', 'num_input_tokens_seen': 12922711, 'train_runtime': '6537', 'train_tokens_per_second': '1977'} +{'loss': '0.3941', 'grad_norm': '0.989', 'learning_rate': '4.998e-05', 'epoch': '0.159', 'num_input_tokens_seen': 12924758, 'train_runtime': '6538', 'train_tokens_per_second': '1977'} +{'loss': '0.7305', 'grad_norm': '1.349', 'learning_rate': '4.998e-05', 'epoch': '0.159', 'num_input_tokens_seen': 12926805, 'train_runtime': '6539', 'train_tokens_per_second': '1977'} +{'loss': '0.461', 'grad_norm': '1.135', 'learning_rate': '4.998e-05', 'epoch': '0.159', 'num_input_tokens_seen': 12928852, 'train_runtime': '6540', 'train_tokens_per_second': '1977'} +{'loss': '0.6381', 'grad_norm': '1.389', 'learning_rate': '4.998e-05', 'epoch': '0.1591', 'num_input_tokens_seen': 12930899, 'train_runtime': '6541', 'train_tokens_per_second': '1977'} +{'loss': '0.3861', 'grad_norm': '1.155', 'learning_rate': '4.998e-05', 'epoch': '0.1591', 'num_input_tokens_seen': 12932946, 'train_runtime': '6542', 'train_tokens_per_second': '1977'} +{'loss': '1.073', 'grad_norm': '1.906', 'learning_rate': '4.998e-05', 'epoch': '0.1591', 'num_input_tokens_seen': 12934993, 'train_runtime': '6543', 'train_tokens_per_second': '1977'} +{'loss': '1.491', 'grad_norm': '2.333', 'learning_rate': '4.998e-05', 'epoch': '0.1591', 'num_input_tokens_seen': 12937040, 'train_runtime': '6544', 'train_tokens_per_second': '1977'} +{'loss': '0.6532', 'grad_norm': '1.591', 'learning_rate': '4.998e-05', 'epoch': '0.1592', 'num_input_tokens_seen': 12939087, 'train_runtime': '6545', 'train_tokens_per_second': '1977'} +{'loss': '0.96', 'grad_norm': '1.846', 'learning_rate': '4.998e-05', 'epoch': '0.1592', 'num_input_tokens_seen': 12941134, 'train_runtime': '6546', 'train_tokens_per_second': '1977'} +{'loss': '0.4871', 'grad_norm': '1.253', 'learning_rate': '4.998e-05', 'epoch': '0.1592', 'num_input_tokens_seen': 12943181, 'train_runtime': '6547', 'train_tokens_per_second': '1977'} +{'loss': '0.296', 'grad_norm': '1.197', 'learning_rate': '4.998e-05', 'epoch': '0.1592', 'num_input_tokens_seen': 12945228, 'train_runtime': '6548', 'train_tokens_per_second': '1977'} +{'loss': '0.4191', 'grad_norm': '1.289', 'learning_rate': '4.998e-05', 'epoch': '0.1593', 'num_input_tokens_seen': 12947275, 'train_runtime': '6549', 'train_tokens_per_second': '1977'} +{'loss': '1.309', 'grad_norm': '1.954', 'learning_rate': '4.998e-05', 'epoch': '0.1593', 'num_input_tokens_seen': 12949322, 'train_runtime': '6550', 'train_tokens_per_second': '1977'} +{'loss': '0.3948', 'grad_norm': '0.9709', 'learning_rate': '4.998e-05', 'epoch': '0.1593', 'num_input_tokens_seen': 12951369, 'train_runtime': '6551', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '1.552', 'learning_rate': '4.998e-05', 'epoch': '0.1593', 'num_input_tokens_seen': 12953416, 'train_runtime': '6552', 'train_tokens_per_second': '1977'} +{'loss': '0.5266', 'grad_norm': '1.156', 'learning_rate': '4.998e-05', 'epoch': '0.1594', 'num_input_tokens_seen': 12955463, 'train_runtime': '6553', 'train_tokens_per_second': '1977'} +{'loss': '0.441', 'grad_norm': '1.003', 'learning_rate': '4.998e-05', 'epoch': '0.1594', 'num_input_tokens_seen': 12957510, 'train_runtime': '6554', 'train_tokens_per_second': '1977'} +{'loss': '2.287', 'grad_norm': '2.694', 'learning_rate': '4.998e-05', 'epoch': '0.1594', 'num_input_tokens_seen': 12959557, 'train_runtime': '6555', 'train_tokens_per_second': '1977'} +{'loss': '0.3081', 'grad_norm': '1.042', 'learning_rate': '4.998e-05', 'epoch': '0.1594', 'num_input_tokens_seen': 12961604, 'train_runtime': '6556', 'train_tokens_per_second': '1977'} +{'loss': '0.3936', 'grad_norm': '1.03', 'learning_rate': '4.998e-05', 'epoch': '0.1595', 'num_input_tokens_seen': 12963651, 'train_runtime': '6557', 'train_tokens_per_second': '1977'} +{'loss': '0.9179', 'grad_norm': '1.564', 'learning_rate': '4.998e-05', 'epoch': '0.1595', 'num_input_tokens_seen': 12965698, 'train_runtime': '6558', 'train_tokens_per_second': '1977'} +{'loss': '0.7402', 'grad_norm': '1.266', 'learning_rate': '4.998e-05', 'epoch': '0.1595', 'num_input_tokens_seen': 12967745, 'train_runtime': '6559', 'train_tokens_per_second': '1977'} +{'loss': '0.3475', 'grad_norm': '0.8639', 'learning_rate': '4.998e-05', 'epoch': '0.1595', 'num_input_tokens_seen': 12969792, 'train_runtime': '6560', 'train_tokens_per_second': '1977'} +{'loss': '0.8392', 'grad_norm': '1.531', 'learning_rate': '4.998e-05', 'epoch': '0.1596', 'num_input_tokens_seen': 12971839, 'train_runtime': '6562', 'train_tokens_per_second': '1977'} +{'loss': '0.3549', 'grad_norm': '1.068', 'learning_rate': '4.998e-05', 'epoch': '0.1596', 'num_input_tokens_seen': 12973886, 'train_runtime': '6563', 'train_tokens_per_second': '1977'} +{'loss': '0.4409', 'grad_norm': '1.093', 'learning_rate': '4.998e-05', 'epoch': '0.1596', 'num_input_tokens_seen': 12975933, 'train_runtime': '6564', 'train_tokens_per_second': '1977'} +{'loss': '1.087', 'grad_norm': '1.487', 'learning_rate': '4.998e-05', 'epoch': '0.1596', 'num_input_tokens_seen': 12977980, 'train_runtime': '6565', 'train_tokens_per_second': '1977'} +{'loss': '1.268', 'grad_norm': '1.947', 'learning_rate': '4.998e-05', 'epoch': '0.1597', 'num_input_tokens_seen': 12980027, 'train_runtime': '6566', 'train_tokens_per_second': '1977'} +{'loss': '0.6117', 'grad_norm': '1.227', 'learning_rate': '4.998e-05', 'epoch': '0.1597', 'num_input_tokens_seen': 12982074, 'train_runtime': '6567', 'train_tokens_per_second': '1977'} +{'loss': '0.911', 'grad_norm': '1.518', 'learning_rate': '4.998e-05', 'epoch': '0.1597', 'num_input_tokens_seen': 12984121, 'train_runtime': '6568', 'train_tokens_per_second': '1977'} +{'loss': '0.382', 'grad_norm': '1.197', 'learning_rate': '4.998e-05', 'epoch': '0.1597', 'num_input_tokens_seen': 12986168, 'train_runtime': '6569', 'train_tokens_per_second': '1977'} +{'loss': '0.4482', 'grad_norm': '1.089', 'learning_rate': '4.998e-05', 'epoch': '0.1598', 'num_input_tokens_seen': 12988215, 'train_runtime': '6570', 'train_tokens_per_second': '1977'} +{'loss': '0.9365', 'grad_norm': '1.395', 'learning_rate': '4.998e-05', 'epoch': '0.1598', 'num_input_tokens_seen': 12990262, 'train_runtime': '6571', 'train_tokens_per_second': '1977'} +{'loss': '0.5959', 'grad_norm': '1.116', 'learning_rate': '4.998e-05', 'epoch': '0.1598', 'num_input_tokens_seen': 12992309, 'train_runtime': '6572', 'train_tokens_per_second': '1977'} +{'loss': '0.4834', 'grad_norm': '1.141', 'learning_rate': '4.998e-05', 'epoch': '0.1598', 'num_input_tokens_seen': 12994356, 'train_runtime': '6573', 'train_tokens_per_second': '1977'} +{'loss': '1.097', 'grad_norm': '1.856', 'learning_rate': '4.998e-05', 'epoch': '0.1599', 'num_input_tokens_seen': 12996403, 'train_runtime': '6574', 'train_tokens_per_second': '1977'} +{'loss': '0.798', 'grad_norm': '1.537', 'learning_rate': '4.998e-05', 'epoch': '0.1599', 'num_input_tokens_seen': 12998450, 'train_runtime': '6575', 'train_tokens_per_second': '1977'} +{'loss': '1.518', 'grad_norm': '2.072', 'learning_rate': '4.998e-05', 'epoch': '0.1599', 'num_input_tokens_seen': 13000497, 'train_runtime': '6576', 'train_tokens_per_second': '1977'} +{'loss': '0.5299', 'grad_norm': '1.168', 'learning_rate': '4.998e-05', 'epoch': '0.1599', 'num_input_tokens_seen': 13002544, 'train_runtime': '6577', 'train_tokens_per_second': '1977'} +{'loss': '0.3907', 'grad_norm': '1.138', 'learning_rate': '4.998e-05', 'epoch': '0.16', 'num_input_tokens_seen': 13004591, 'train_runtime': '6578', 'train_tokens_per_second': '1977'} +{'loss': '0.4326', 'grad_norm': '1.121', 'learning_rate': '4.998e-05', 'epoch': '0.16', 'num_input_tokens_seen': 13006638, 'train_runtime': '6579', 'train_tokens_per_second': '1977'} +{'loss': '0.5294', 'grad_norm': '1.011', 'learning_rate': '4.998e-05', 'epoch': '0.16', 'num_input_tokens_seen': 13008685, 'train_runtime': '6580', 'train_tokens_per_second': '1977'} +{'loss': '0.4368', 'grad_norm': '1.438', 'learning_rate': '4.998e-05', 'epoch': '0.16', 'num_input_tokens_seen': 13010732, 'train_runtime': '6581', 'train_tokens_per_second': '1977'} +{'loss': '1.145', 'grad_norm': '1.639', 'learning_rate': '4.998e-05', 'epoch': '0.1601', 'num_input_tokens_seen': 13012779, 'train_runtime': '6582', 'train_tokens_per_second': '1977'} +{'loss': '0.9827', 'grad_norm': '1.491', 'learning_rate': '4.998e-05', 'epoch': '0.1601', 'num_input_tokens_seen': 13014826, 'train_runtime': '6583', 'train_tokens_per_second': '1977'} +{'loss': '1.609', 'grad_norm': '2.313', 'learning_rate': '4.998e-05', 'epoch': '0.1601', 'num_input_tokens_seen': 13016873, 'train_runtime': '6584', 'train_tokens_per_second': '1977'} +{'loss': '0.7238', 'grad_norm': '1.344', 'learning_rate': '4.998e-05', 'epoch': '0.1601', 'num_input_tokens_seen': 13018920, 'train_runtime': '6585', 'train_tokens_per_second': '1977'} +{'loss': '2.547', 'grad_norm': '2.315', 'learning_rate': '4.998e-05', 'epoch': '0.1602', 'num_input_tokens_seen': 13020967, 'train_runtime': '6586', 'train_tokens_per_second': '1977'} +{'loss': '0.3442', 'grad_norm': '1.206', 'learning_rate': '4.998e-05', 'epoch': '0.1602', 'num_input_tokens_seen': 13023014, 'train_runtime': '6587', 'train_tokens_per_second': '1977'} +{'loss': '1.713', 'grad_norm': '2.037', 'learning_rate': '4.998e-05', 'epoch': '0.1602', 'num_input_tokens_seen': 13025061, 'train_runtime': '6588', 'train_tokens_per_second': '1977'} +{'loss': '0.4129', 'grad_norm': '1.213', 'learning_rate': '4.998e-05', 'epoch': '0.1602', 'num_input_tokens_seen': 13027108, 'train_runtime': '6589', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '1.987', 'learning_rate': '4.998e-05', 'epoch': '0.1603', 'num_input_tokens_seen': 13029155, 'train_runtime': '6590', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '1.589', 'learning_rate': '4.998e-05', 'epoch': '0.1603', 'num_input_tokens_seen': 13031202, 'train_runtime': '6592', 'train_tokens_per_second': '1977'} +{'loss': '0.8952', 'grad_norm': '1.645', 'learning_rate': '4.998e-05', 'epoch': '0.1603', 'num_input_tokens_seen': 13033249, 'train_runtime': '6593', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '2.123', 'learning_rate': '4.998e-05', 'epoch': '0.1603', 'num_input_tokens_seen': 13035296, 'train_runtime': '6594', 'train_tokens_per_second': '1977'} +{'loss': '1.313', 'grad_norm': '1.757', 'learning_rate': '4.998e-05', 'epoch': '0.1604', 'num_input_tokens_seen': 13037343, 'train_runtime': '6595', 'train_tokens_per_second': '1977'} +{'loss': '1.142', 'grad_norm': '2.219', 'learning_rate': '4.998e-05', 'epoch': '0.1604', 'num_input_tokens_seen': 13039390, 'train_runtime': '6596', 'train_tokens_per_second': '1977'} +{'loss': '1.095', 'grad_norm': '1.906', 'learning_rate': '4.998e-05', 'epoch': '0.1604', 'num_input_tokens_seen': 13041437, 'train_runtime': '6597', 'train_tokens_per_second': '1977'} +{'loss': '0.3312', 'grad_norm': '0.9118', 'learning_rate': '4.998e-05', 'epoch': '0.1604', 'num_input_tokens_seen': 13043484, 'train_runtime': '6598', 'train_tokens_per_second': '1977'} +{'loss': '0.8667', 'grad_norm': '1.324', 'learning_rate': '4.998e-05', 'epoch': '0.1605', 'num_input_tokens_seen': 13045531, 'train_runtime': '6599', 'train_tokens_per_second': '1977'} +{'loss': '0.9398', 'grad_norm': '1.516', 'learning_rate': '4.998e-05', 'epoch': '0.1605', 'num_input_tokens_seen': 13047578, 'train_runtime': '6600', 'train_tokens_per_second': '1977'} +{'loss': '0.7649', 'grad_norm': '1.309', 'learning_rate': '4.998e-05', 'epoch': '0.1605', 'num_input_tokens_seen': 13049625, 'train_runtime': '6601', 'train_tokens_per_second': '1977'} +{'loss': '1.776', 'grad_norm': '1.937', 'learning_rate': '4.998e-05', 'epoch': '0.1605', 'num_input_tokens_seen': 13051672, 'train_runtime': '6602', 'train_tokens_per_second': '1977'} +{'loss': '1.544', 'grad_norm': '2.232', 'learning_rate': '4.998e-05', 'epoch': '0.1606', 'num_input_tokens_seen': 13053719, 'train_runtime': '6603', 'train_tokens_per_second': '1977'} +{'loss': '0.8666', 'grad_norm': '1.589', 'learning_rate': '4.998e-05', 'epoch': '0.1606', 'num_input_tokens_seen': 13055766, 'train_runtime': '6604', 'train_tokens_per_second': '1977'} +{'loss': '0.3024', 'grad_norm': '1.06', 'learning_rate': '4.998e-05', 'epoch': '0.1606', 'num_input_tokens_seen': 13057813, 'train_runtime': '6605', 'train_tokens_per_second': '1977'} +{'loss': '0.4393', 'grad_norm': '2.597', 'learning_rate': '4.998e-05', 'epoch': '0.1606', 'num_input_tokens_seen': 13059860, 'train_runtime': '6606', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '1.723', 'learning_rate': '4.998e-05', 'epoch': '0.1607', 'num_input_tokens_seen': 13061907, 'train_runtime': '6607', 'train_tokens_per_second': '1977'} +{'loss': '1.111', 'grad_norm': '1.586', 'learning_rate': '4.998e-05', 'epoch': '0.1607', 'num_input_tokens_seen': 13063954, 'train_runtime': '6608', 'train_tokens_per_second': '1977'} +{'loss': '1.752', 'grad_norm': '2.211', 'learning_rate': '4.998e-05', 'epoch': '0.1607', 'num_input_tokens_seen': 13066001, 'train_runtime': '6609', 'train_tokens_per_second': '1977'} +{'loss': '0.5773', 'grad_norm': '1.147', 'learning_rate': '4.998e-05', 'epoch': '0.1607', 'num_input_tokens_seen': 13068048, 'train_runtime': '6610', 'train_tokens_per_second': '1977'} +{'loss': '2.552', 'grad_norm': '2.675', 'learning_rate': '4.998e-05', 'epoch': '0.1608', 'num_input_tokens_seen': 13070095, 'train_runtime': '6611', 'train_tokens_per_second': '1977'} +{'loss': '0.7522', 'grad_norm': '1.254', 'learning_rate': '4.998e-05', 'epoch': '0.1608', 'num_input_tokens_seen': 13072142, 'train_runtime': '6612', 'train_tokens_per_second': '1977'} +{'loss': '0.5338', 'grad_norm': '1.247', 'learning_rate': '4.998e-05', 'epoch': '0.1608', 'num_input_tokens_seen': 13074189, 'train_runtime': '6613', 'train_tokens_per_second': '1977'} +{'loss': '0.6611', 'grad_norm': '1.161', 'learning_rate': '4.998e-05', 'epoch': '0.1608', 'num_input_tokens_seen': 13076236, 'train_runtime': '6614', 'train_tokens_per_second': '1977'} +{'loss': '0.9079', 'grad_norm': '1.32', 'learning_rate': '4.998e-05', 'epoch': '0.1609', 'num_input_tokens_seen': 13078283, 'train_runtime': '6615', 'train_tokens_per_second': '1977'} +{'loss': '0.3438', 'grad_norm': '0.9161', 'learning_rate': '4.998e-05', 'epoch': '0.1609', 'num_input_tokens_seen': 13080330, 'train_runtime': '6616', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '1.929', 'learning_rate': '4.998e-05', 'epoch': '0.1609', 'num_input_tokens_seen': 13082377, 'train_runtime': '6617', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '1.668', 'learning_rate': '4.998e-05', 'epoch': '0.1609', 'num_input_tokens_seen': 13084424, 'train_runtime': '6618', 'train_tokens_per_second': '1977'} +{'loss': '1.916', 'grad_norm': '2.396', 'learning_rate': '4.998e-05', 'epoch': '0.161', 'num_input_tokens_seen': 13086471, 'train_runtime': '6619', 'train_tokens_per_second': '1977'} +{'loss': '0.3952', 'grad_norm': '1.125', 'learning_rate': '4.998e-05', 'epoch': '0.161', 'num_input_tokens_seen': 13088518, 'train_runtime': '6620', 'train_tokens_per_second': '1977'} +{'loss': '0.5825', 'grad_norm': '1.36', 'learning_rate': '4.998e-05', 'epoch': '0.161', 'num_input_tokens_seen': 13090565, 'train_runtime': '6622', 'train_tokens_per_second': '1977'} +{'loss': '1.795', 'grad_norm': '2.706', 'learning_rate': '4.998e-05', 'epoch': '0.161', 'num_input_tokens_seen': 13092612, 'train_runtime': '6623', 'train_tokens_per_second': '1977'} +{'loss': '0.3395', 'grad_norm': '1.094', 'learning_rate': '4.998e-05', 'epoch': '0.1611', 'num_input_tokens_seen': 13094659, 'train_runtime': '6624', 'train_tokens_per_second': '1977'} +{'loss': '0.9198', 'grad_norm': '1.875', 'learning_rate': '4.998e-05', 'epoch': '0.1611', 'num_input_tokens_seen': 13096706, 'train_runtime': '6625', 'train_tokens_per_second': '1977'} +{'loss': '1.308', 'grad_norm': '2.012', 'learning_rate': '4.998e-05', 'epoch': '0.1611', 'num_input_tokens_seen': 13098753, 'train_runtime': '6626', 'train_tokens_per_second': '1977'} +{'loss': '0.3657', 'grad_norm': '0.9132', 'learning_rate': '4.998e-05', 'epoch': '0.1611', 'num_input_tokens_seen': 13100800, 'train_runtime': '6627', 'train_tokens_per_second': '1977'} +{'loss': '0.2757', 'grad_norm': '0.9738', 'learning_rate': '4.998e-05', 'epoch': '0.1612', 'num_input_tokens_seen': 13102847, 'train_runtime': '6628', 'train_tokens_per_second': '1977'} +{'loss': '0.8514', 'grad_norm': '1.647', 'learning_rate': '4.998e-05', 'epoch': '0.1612', 'num_input_tokens_seen': 13104894, 'train_runtime': '6629', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '1.572', 'learning_rate': '4.998e-05', 'epoch': '0.1612', 'num_input_tokens_seen': 13106941, 'train_runtime': '6630', 'train_tokens_per_second': '1977'} +{'loss': '0.62', 'grad_norm': '1.382', 'learning_rate': '4.998e-05', 'epoch': '0.1612', 'num_input_tokens_seen': 13108988, 'train_runtime': '6631', 'train_tokens_per_second': '1977'} +{'loss': '0.744', 'grad_norm': '1.756', 'learning_rate': '4.998e-05', 'epoch': '0.1613', 'num_input_tokens_seen': 13111035, 'train_runtime': '6632', 'train_tokens_per_second': '1977'} +{'loss': '1.26', 'grad_norm': '2.071', 'learning_rate': '4.998e-05', 'epoch': '0.1613', 'num_input_tokens_seen': 13113082, 'train_runtime': '6633', 'train_tokens_per_second': '1977'} +{'loss': '0.8718', 'grad_norm': '1.492', 'learning_rate': '4.998e-05', 'epoch': '0.1613', 'num_input_tokens_seen': 13115129, 'train_runtime': '6634', 'train_tokens_per_second': '1977'} +{'loss': '0.311', 'grad_norm': '1.012', 'learning_rate': '4.998e-05', 'epoch': '0.1613', 'num_input_tokens_seen': 13117176, 'train_runtime': '6635', 'train_tokens_per_second': '1977'} +{'loss': '0.6954', 'grad_norm': '1.34', 'learning_rate': '4.998e-05', 'epoch': '0.1614', 'num_input_tokens_seen': 13119223, 'train_runtime': '6636', 'train_tokens_per_second': '1977'} +{'loss': '1.924', 'grad_norm': '2.614', 'learning_rate': '4.998e-05', 'epoch': '0.1614', 'num_input_tokens_seen': 13121270, 'train_runtime': '6637', 'train_tokens_per_second': '1977'} +{'loss': '0.7339', 'grad_norm': '1.648', 'learning_rate': '4.998e-05', 'epoch': '0.1614', 'num_input_tokens_seen': 13123317, 'train_runtime': '6638', 'train_tokens_per_second': '1977'} +{'loss': '0.7505', 'grad_norm': '1.51', 'learning_rate': '4.998e-05', 'epoch': '0.1614', 'num_input_tokens_seen': 13125364, 'train_runtime': '6639', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '1.858', 'learning_rate': '4.998e-05', 'epoch': '0.1615', 'num_input_tokens_seen': 13127411, 'train_runtime': '6640', 'train_tokens_per_second': '1977'} +{'loss': '0.5351', 'grad_norm': '1.397', 'learning_rate': '4.998e-05', 'epoch': '0.1615', 'num_input_tokens_seen': 13129458, 'train_runtime': '6641', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.572', 'learning_rate': '4.998e-05', 'epoch': '0.1615', 'num_input_tokens_seen': 13131505, 'train_runtime': '6642', 'train_tokens_per_second': '1977'} +{'loss': '0.3856', 'grad_norm': '1.138', 'learning_rate': '4.998e-05', 'epoch': '0.1615', 'num_input_tokens_seen': 13133552, 'train_runtime': '6643', 'train_tokens_per_second': '1977'} +{'loss': '0.4903', 'grad_norm': '1.159', 'learning_rate': '4.998e-05', 'epoch': '0.1616', 'num_input_tokens_seen': 13135599, 'train_runtime': '6644', 'train_tokens_per_second': '1977'} +{'loss': '0.7706', 'grad_norm': '1.539', 'learning_rate': '4.998e-05', 'epoch': '0.1616', 'num_input_tokens_seen': 13137646, 'train_runtime': '6645', 'train_tokens_per_second': '1977'} +{'loss': '0.919', 'grad_norm': '1.252', 'learning_rate': '4.998e-05', 'epoch': '0.1616', 'num_input_tokens_seen': 13139693, 'train_runtime': '6646', 'train_tokens_per_second': '1977'} +{'loss': '0.8794', 'grad_norm': '1.69', 'learning_rate': '4.998e-05', 'epoch': '0.1616', 'num_input_tokens_seen': 13141740, 'train_runtime': '6647', 'train_tokens_per_second': '1977'} +{'loss': '0.4519', 'grad_norm': '1.119', 'learning_rate': '4.998e-05', 'epoch': '0.1617', 'num_input_tokens_seen': 13143787, 'train_runtime': '6648', 'train_tokens_per_second': '1977'} +{'loss': '1.932', 'grad_norm': '2.283', 'learning_rate': '4.998e-05', 'epoch': '0.1617', 'num_input_tokens_seen': 13145834, 'train_runtime': '6649', 'train_tokens_per_second': '1977'} +{'loss': '0.8907', 'grad_norm': '1.447', 'learning_rate': '4.998e-05', 'epoch': '0.1617', 'num_input_tokens_seen': 13147881, 'train_runtime': '6650', 'train_tokens_per_second': '1977'} +{'loss': '0.2939', 'grad_norm': '0.9103', 'learning_rate': '4.998e-05', 'epoch': '0.1617', 'num_input_tokens_seen': 13149928, 'train_runtime': '6651', 'train_tokens_per_second': '1977'} +{'loss': '0.71', 'grad_norm': '1.158', 'learning_rate': '4.998e-05', 'epoch': '0.1618', 'num_input_tokens_seen': 13151975, 'train_runtime': '6652', 'train_tokens_per_second': '1977'} +{'loss': '0.9886', 'grad_norm': '1.596', 'learning_rate': '4.998e-05', 'epoch': '0.1618', 'num_input_tokens_seen': 13154022, 'train_runtime': '6654', 'train_tokens_per_second': '1977'} +{'loss': '0.6387', 'grad_norm': '1.453', 'learning_rate': '4.998e-05', 'epoch': '0.1618', 'num_input_tokens_seen': 13156069, 'train_runtime': '6655', 'train_tokens_per_second': '1977'} +{'loss': '2.013', 'grad_norm': '2.538', 'learning_rate': '4.998e-05', 'epoch': '0.1618', 'num_input_tokens_seen': 13158116, 'train_runtime': '6656', 'train_tokens_per_second': '1977'} +{'loss': '1.871', 'grad_norm': '2.545', 'learning_rate': '4.998e-05', 'epoch': '0.1619', 'num_input_tokens_seen': 13160163, 'train_runtime': '6657', 'train_tokens_per_second': '1977'} +{'loss': '0.6959', 'grad_norm': '1.574', 'learning_rate': '4.998e-05', 'epoch': '0.1619', 'num_input_tokens_seen': 13162210, 'train_runtime': '6658', 'train_tokens_per_second': '1977'} +{'loss': '0.6985', 'grad_norm': '1.416', 'learning_rate': '4.998e-05', 'epoch': '0.1619', 'num_input_tokens_seen': 13164257, 'train_runtime': '6659', 'train_tokens_per_second': '1977'} +{'loss': '1.59', 'grad_norm': '2.14', 'learning_rate': '4.998e-05', 'epoch': '0.1619', 'num_input_tokens_seen': 13166304, 'train_runtime': '6660', 'train_tokens_per_second': '1977'} +{'loss': '1.652', 'grad_norm': '2.677', 'learning_rate': '4.998e-05', 'epoch': '0.162', 'num_input_tokens_seen': 13168351, 'train_runtime': '6661', 'train_tokens_per_second': '1977'} +{'loss': '0.8537', 'grad_norm': '1.577', 'learning_rate': '4.998e-05', 'epoch': '0.162', 'num_input_tokens_seen': 13170398, 'train_runtime': '6662', 'train_tokens_per_second': '1977'} +{'loss': '0.7484', 'grad_norm': '1.486', 'learning_rate': '4.998e-05', 'epoch': '0.162', 'num_input_tokens_seen': 13172445, 'train_runtime': '6663', 'train_tokens_per_second': '1977'} +{'loss': '0.3649', 'grad_norm': '1.087', 'learning_rate': '4.998e-05', 'epoch': '0.162', 'num_input_tokens_seen': 13174492, 'train_runtime': '6664', 'train_tokens_per_second': '1977'} +{'loss': '0.8237', 'grad_norm': '1.027', 'learning_rate': '4.998e-05', 'epoch': '0.1621', 'num_input_tokens_seen': 13176539, 'train_runtime': '6665', 'train_tokens_per_second': '1977'} +{'loss': '1.716', 'grad_norm': '1.908', 'learning_rate': '4.998e-05', 'epoch': '0.1621', 'num_input_tokens_seen': 13178586, 'train_runtime': '6666', 'train_tokens_per_second': '1977'} +{'loss': '0.4611', 'grad_norm': '1.311', 'learning_rate': '4.998e-05', 'epoch': '0.1621', 'num_input_tokens_seen': 13180633, 'train_runtime': '6667', 'train_tokens_per_second': '1977'} +{'loss': '0.7082', 'grad_norm': '1.557', 'learning_rate': '4.998e-05', 'epoch': '0.1621', 'num_input_tokens_seen': 13182680, 'train_runtime': '6668', 'train_tokens_per_second': '1977'} +{'loss': '1.302', 'grad_norm': '1.459', 'learning_rate': '4.998e-05', 'epoch': '0.1622', 'num_input_tokens_seen': 13184727, 'train_runtime': '6669', 'train_tokens_per_second': '1977'} +{'loss': '1.6', 'grad_norm': '2.248', 'learning_rate': '4.998e-05', 'epoch': '0.1622', 'num_input_tokens_seen': 13186774, 'train_runtime': '6670', 'train_tokens_per_second': '1977'} +{'loss': '2.293', 'grad_norm': '3.822', 'learning_rate': '4.998e-05', 'epoch': '0.1622', 'num_input_tokens_seen': 13188821, 'train_runtime': '6671', 'train_tokens_per_second': '1977'} +{'loss': '0.78', 'grad_norm': '1.44', 'learning_rate': '4.998e-05', 'epoch': '0.1622', 'num_input_tokens_seen': 13190868, 'train_runtime': '6672', 'train_tokens_per_second': '1977'} +{'loss': '0.8015', 'grad_norm': '1.689', 'learning_rate': '4.998e-05', 'epoch': '0.1623', 'num_input_tokens_seen': 13192915, 'train_runtime': '6673', 'train_tokens_per_second': '1977'} +{'loss': '0.904', 'grad_norm': '1.585', 'learning_rate': '4.998e-05', 'epoch': '0.1623', 'num_input_tokens_seen': 13194962, 'train_runtime': '6674', 'train_tokens_per_second': '1977'} +{'loss': '0.751', 'grad_norm': '1.408', 'learning_rate': '4.998e-05', 'epoch': '0.1623', 'num_input_tokens_seen': 13197009, 'train_runtime': '6675', 'train_tokens_per_second': '1977'} +{'loss': '1.195', 'grad_norm': '2.371', 'learning_rate': '4.998e-05', 'epoch': '0.1623', 'num_input_tokens_seen': 13199056, 'train_runtime': '6676', 'train_tokens_per_second': '1977'} +{'loss': '1.706', 'grad_norm': '2.248', 'learning_rate': '4.998e-05', 'epoch': '0.1624', 'num_input_tokens_seen': 13201103, 'train_runtime': '6677', 'train_tokens_per_second': '1977'} +{'loss': '0.7225', 'grad_norm': '1.211', 'learning_rate': '4.998e-05', 'epoch': '0.1624', 'num_input_tokens_seen': 13203150, 'train_runtime': '6678', 'train_tokens_per_second': '1977'} +{'loss': '1.456', 'grad_norm': '2.543', 'learning_rate': '4.998e-05', 'epoch': '0.1624', 'num_input_tokens_seen': 13205197, 'train_runtime': '6679', 'train_tokens_per_second': '1977'} +{'loss': '0.5521', 'grad_norm': '1.549', 'learning_rate': '4.998e-05', 'epoch': '0.1624', 'num_input_tokens_seen': 13207244, 'train_runtime': '6680', 'train_tokens_per_second': '1977'} +{'loss': '0.244', 'grad_norm': '1.027', 'learning_rate': '4.998e-05', 'epoch': '0.1625', 'num_input_tokens_seen': 13209291, 'train_runtime': '6681', 'train_tokens_per_second': '1977'} +{'loss': '0.9221', 'grad_norm': '1.432', 'learning_rate': '4.998e-05', 'epoch': '0.1625', 'num_input_tokens_seen': 13211338, 'train_runtime': '6682', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '1.797', 'learning_rate': '4.998e-05', 'epoch': '0.1625', 'num_input_tokens_seen': 13213385, 'train_runtime': '6684', 'train_tokens_per_second': '1977'} +{'loss': '0.6531', 'grad_norm': '1.474', 'learning_rate': '4.998e-05', 'epoch': '0.1626', 'num_input_tokens_seen': 13215432, 'train_runtime': '6685', 'train_tokens_per_second': '1977'} +{'loss': '0.7652', 'grad_norm': '1.444', 'learning_rate': '4.998e-05', 'epoch': '0.1626', 'num_input_tokens_seen': 13217479, 'train_runtime': '6686', 'train_tokens_per_second': '1977'} +{'loss': '0.4463', 'grad_norm': '0.9979', 'learning_rate': '4.998e-05', 'epoch': '0.1626', 'num_input_tokens_seen': 13219526, 'train_runtime': '6687', 'train_tokens_per_second': '1977'} +{'loss': '0.8363', 'grad_norm': '1.454', 'learning_rate': '4.998e-05', 'epoch': '0.1626', 'num_input_tokens_seen': 13221573, 'train_runtime': '6688', 'train_tokens_per_second': '1977'} +{'loss': '0.8936', 'grad_norm': '1.459', 'learning_rate': '4.998e-05', 'epoch': '0.1627', 'num_input_tokens_seen': 13223620, 'train_runtime': '6689', 'train_tokens_per_second': '1977'} +{'loss': '0.8361', 'grad_norm': '1.84', 'learning_rate': '4.998e-05', 'epoch': '0.1627', 'num_input_tokens_seen': 13225667, 'train_runtime': '6690', 'train_tokens_per_second': '1977'} +{'loss': '1.816', 'grad_norm': '2.325', 'learning_rate': '4.998e-05', 'epoch': '0.1627', 'num_input_tokens_seen': 13227714, 'train_runtime': '6691', 'train_tokens_per_second': '1977'} +{'loss': '0.4669', 'grad_norm': '1.338', 'learning_rate': '4.998e-05', 'epoch': '0.1627', 'num_input_tokens_seen': 13229761, 'train_runtime': '6692', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '1.521', 'learning_rate': '4.998e-05', 'epoch': '0.1628', 'num_input_tokens_seen': 13231808, 'train_runtime': '6693', 'train_tokens_per_second': '1977'} +{'loss': '0.6867', 'grad_norm': '1.35', 'learning_rate': '4.998e-05', 'epoch': '0.1628', 'num_input_tokens_seen': 13233855, 'train_runtime': '6694', 'train_tokens_per_second': '1977'} +{'loss': '0.8191', 'grad_norm': '1.464', 'learning_rate': '4.998e-05', 'epoch': '0.1628', 'num_input_tokens_seen': 13235902, 'train_runtime': '6695', 'train_tokens_per_second': '1977'} +{'loss': '0.7869', 'grad_norm': '1.535', 'learning_rate': '4.998e-05', 'epoch': '0.1628', 'num_input_tokens_seen': 13237949, 'train_runtime': '6696', 'train_tokens_per_second': '1977'} +{'loss': '0.465', 'grad_norm': '0.9806', 'learning_rate': '4.998e-05', 'epoch': '0.1629', 'num_input_tokens_seen': 13239996, 'train_runtime': '6697', 'train_tokens_per_second': '1977'} +{'loss': '0.8856', 'grad_norm': '1.67', 'learning_rate': '4.998e-05', 'epoch': '0.1629', 'num_input_tokens_seen': 13242043, 'train_runtime': '6698', 'train_tokens_per_second': '1977'} +{'loss': '0.2921', 'grad_norm': '1.09', 'learning_rate': '4.998e-05', 'epoch': '0.1629', 'num_input_tokens_seen': 13244090, 'train_runtime': '6699', 'train_tokens_per_second': '1977'} +{'loss': '0.4174', 'grad_norm': '1.006', 'learning_rate': '4.998e-05', 'epoch': '0.1629', 'num_input_tokens_seen': 13246137, 'train_runtime': '6700', 'train_tokens_per_second': '1977'} +{'loss': '0.671', 'grad_norm': '1.14', 'learning_rate': '4.998e-05', 'epoch': '0.163', 'num_input_tokens_seen': 13248184, 'train_runtime': '6701', 'train_tokens_per_second': '1977'} +{'loss': '2.186', 'grad_norm': '2.781', 'learning_rate': '4.998e-05', 'epoch': '0.163', 'num_input_tokens_seen': 13250231, 'train_runtime': '6702', 'train_tokens_per_second': '1977'} +{'loss': '0.6039', 'grad_norm': '1.159', 'learning_rate': '4.998e-05', 'epoch': '0.163', 'num_input_tokens_seen': 13252278, 'train_runtime': '6703', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '1.301', 'learning_rate': '4.998e-05', 'epoch': '0.163', 'num_input_tokens_seen': 13254325, 'train_runtime': '6704', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '1.754', 'learning_rate': '4.998e-05', 'epoch': '0.1631', 'num_input_tokens_seen': 13256372, 'train_runtime': '6705', 'train_tokens_per_second': '1977'} +{'loss': '2.056', 'grad_norm': '2.733', 'learning_rate': '4.998e-05', 'epoch': '0.1631', 'num_input_tokens_seen': 13258419, 'train_runtime': '6706', 'train_tokens_per_second': '1977'} +{'loss': '0.7795', 'grad_norm': '1.395', 'learning_rate': '4.998e-05', 'epoch': '0.1631', 'num_input_tokens_seen': 13260466, 'train_runtime': '6707', 'train_tokens_per_second': '1977'} +{'loss': '1.671', 'grad_norm': '1.985', 'learning_rate': '4.998e-05', 'epoch': '0.1631', 'num_input_tokens_seen': 13262513, 'train_runtime': '6708', 'train_tokens_per_second': '1977'} +{'loss': '0.6684', 'grad_norm': '1.48', 'learning_rate': '4.998e-05', 'epoch': '0.1632', 'num_input_tokens_seen': 13264560, 'train_runtime': '6709', 'train_tokens_per_second': '1977'} +{'loss': '0.5345', 'grad_norm': '1.179', 'learning_rate': '4.998e-05', 'epoch': '0.1632', 'num_input_tokens_seen': 13266607, 'train_runtime': '6710', 'train_tokens_per_second': '1977'} +{'loss': '1.939', 'grad_norm': '3.136', 'learning_rate': '4.998e-05', 'epoch': '0.1632', 'num_input_tokens_seen': 13268654, 'train_runtime': '6711', 'train_tokens_per_second': '1977'} +{'loss': '1.442', 'grad_norm': '1.987', 'learning_rate': '4.998e-05', 'epoch': '0.1632', 'num_input_tokens_seen': 13270701, 'train_runtime': '6712', 'train_tokens_per_second': '1977'} +{'loss': '0.2699', 'grad_norm': '1.111', 'learning_rate': '4.998e-05', 'epoch': '0.1633', 'num_input_tokens_seen': 13272748, 'train_runtime': '6713', 'train_tokens_per_second': '1977'} +{'loss': '0.8123', 'grad_norm': '1.331', 'learning_rate': '4.998e-05', 'epoch': '0.1633', 'num_input_tokens_seen': 13274795, 'train_runtime': '6715', 'train_tokens_per_second': '1977'} +{'loss': '0.2661', 'grad_norm': '0.9893', 'learning_rate': '4.998e-05', 'epoch': '0.1633', 'num_input_tokens_seen': 13276842, 'train_runtime': '6716', 'train_tokens_per_second': '1977'} +{'loss': '0.7108', 'grad_norm': '1.181', 'learning_rate': '4.998e-05', 'epoch': '0.1633', 'num_input_tokens_seen': 13278889, 'train_runtime': '6717', 'train_tokens_per_second': '1977'} +{'loss': '0.9989', 'grad_norm': '1.579', 'learning_rate': '4.998e-05', 'epoch': '0.1634', 'num_input_tokens_seen': 13280936, 'train_runtime': '6718', 'train_tokens_per_second': '1977'} +{'loss': '0.646', 'grad_norm': '1.417', 'learning_rate': '4.998e-05', 'epoch': '0.1634', 'num_input_tokens_seen': 13282983, 'train_runtime': '6719', 'train_tokens_per_second': '1977'} +{'loss': '0.8026', 'grad_norm': '1.335', 'learning_rate': '4.998e-05', 'epoch': '0.1634', 'num_input_tokens_seen': 13285030, 'train_runtime': '6720', 'train_tokens_per_second': '1977'} +{'loss': '0.5663', 'grad_norm': '1.232', 'learning_rate': '4.998e-05', 'epoch': '0.1634', 'num_input_tokens_seen': 13287077, 'train_runtime': '6721', 'train_tokens_per_second': '1977'} +{'loss': '0.7461', 'grad_norm': '1.505', 'learning_rate': '4.998e-05', 'epoch': '0.1635', 'num_input_tokens_seen': 13289124, 'train_runtime': '6722', 'train_tokens_per_second': '1977'} +{'loss': '0.9309', 'grad_norm': '1.767', 'learning_rate': '4.998e-05', 'epoch': '0.1635', 'num_input_tokens_seen': 13291171, 'train_runtime': '6723', 'train_tokens_per_second': '1977'} +{'loss': '1.255', 'grad_norm': '1.797', 'learning_rate': '4.998e-05', 'epoch': '0.1635', 'num_input_tokens_seen': 13293218, 'train_runtime': '6724', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '2.085', 'learning_rate': '4.998e-05', 'epoch': '0.1635', 'num_input_tokens_seen': 13295265, 'train_runtime': '6725', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '2.247', 'learning_rate': '4.998e-05', 'epoch': '0.1636', 'num_input_tokens_seen': 13297312, 'train_runtime': '6726', 'train_tokens_per_second': '1977'} +{'loss': '1.134', 'grad_norm': '2.139', 'learning_rate': '4.998e-05', 'epoch': '0.1636', 'num_input_tokens_seen': 13299359, 'train_runtime': '6727', 'train_tokens_per_second': '1977'} +{'loss': '1.051', 'grad_norm': '1.646', 'learning_rate': '4.998e-05', 'epoch': '0.1636', 'num_input_tokens_seen': 13301406, 'train_runtime': '6728', 'train_tokens_per_second': '1977'} +{'loss': '1.074', 'grad_norm': '1.905', 'learning_rate': '4.998e-05', 'epoch': '0.1636', 'num_input_tokens_seen': 13303453, 'train_runtime': '6729', 'train_tokens_per_second': '1977'} +{'loss': '0.5446', 'grad_norm': '1.121', 'learning_rate': '4.998e-05', 'epoch': '0.1637', 'num_input_tokens_seen': 13305500, 'train_runtime': '6730', 'train_tokens_per_second': '1977'} +{'loss': '0.4229', 'grad_norm': '1.306', 'learning_rate': '4.998e-05', 'epoch': '0.1637', 'num_input_tokens_seen': 13307547, 'train_runtime': '6731', 'train_tokens_per_second': '1977'} +{'loss': '0.9984', 'grad_norm': '1.605', 'learning_rate': '4.998e-05', 'epoch': '0.1637', 'num_input_tokens_seen': 13309594, 'train_runtime': '6732', 'train_tokens_per_second': '1977'} +{'loss': '0.3944', 'grad_norm': '1.536', 'learning_rate': '4.998e-05', 'epoch': '0.1637', 'num_input_tokens_seen': 13311641, 'train_runtime': '6733', 'train_tokens_per_second': '1977'} +{'loss': '0.6822', 'grad_norm': '1.72', 'learning_rate': '4.998e-05', 'epoch': '0.1638', 'num_input_tokens_seen': 13313688, 'train_runtime': '6734', 'train_tokens_per_second': '1977'} +{'loss': '0.288', 'grad_norm': '1.009', 'learning_rate': '4.998e-05', 'epoch': '0.1638', 'num_input_tokens_seen': 13315735, 'train_runtime': '6735', 'train_tokens_per_second': '1977'} +{'loss': '0.622', 'grad_norm': '1.868', 'learning_rate': '4.998e-05', 'epoch': '0.1638', 'num_input_tokens_seen': 13317782, 'train_runtime': '6736', 'train_tokens_per_second': '1977'} +{'loss': '1.65', 'grad_norm': '2.795', 'learning_rate': '4.998e-05', 'epoch': '0.1638', 'num_input_tokens_seen': 13319829, 'train_runtime': '6737', 'train_tokens_per_second': '1977'} +{'loss': '0.9504', 'grad_norm': '1.761', 'learning_rate': '4.998e-05', 'epoch': '0.1639', 'num_input_tokens_seen': 13321876, 'train_runtime': '6738', 'train_tokens_per_second': '1977'} +{'loss': '0.6635', 'grad_norm': '1.491', 'learning_rate': '4.998e-05', 'epoch': '0.1639', 'num_input_tokens_seen': 13323923, 'train_runtime': '6739', 'train_tokens_per_second': '1977'} +{'loss': '0.3764', 'grad_norm': '1.131', 'learning_rate': '4.998e-05', 'epoch': '0.1639', 'num_input_tokens_seen': 13325970, 'train_runtime': '6740', 'train_tokens_per_second': '1977'} +{'loss': '1.257', 'grad_norm': '1.827', 'learning_rate': '4.998e-05', 'epoch': '0.1639', 'num_input_tokens_seen': 13328017, 'train_runtime': '6741', 'train_tokens_per_second': '1977'} +{'loss': '0.6011', 'grad_norm': '1.252', 'learning_rate': '4.998e-05', 'epoch': '0.164', 'num_input_tokens_seen': 13330064, 'train_runtime': '6742', 'train_tokens_per_second': '1977'} +{'loss': '1.557', 'grad_norm': '2.013', 'learning_rate': '4.998e-05', 'epoch': '0.164', 'num_input_tokens_seen': 13332111, 'train_runtime': '6744', 'train_tokens_per_second': '1977'} +{'loss': '0.5568', 'grad_norm': '1.197', 'learning_rate': '4.998e-05', 'epoch': '0.164', 'num_input_tokens_seen': 13334158, 'train_runtime': '6745', 'train_tokens_per_second': '1977'} +{'loss': '0.7449', 'grad_norm': '1.682', 'learning_rate': '4.998e-05', 'epoch': '0.164', 'num_input_tokens_seen': 13336205, 'train_runtime': '6746', 'train_tokens_per_second': '1977'} +{'loss': '0.3997', 'grad_norm': '1.037', 'learning_rate': '4.998e-05', 'epoch': '0.1641', 'num_input_tokens_seen': 13338252, 'train_runtime': '6747', 'train_tokens_per_second': '1977'} +{'loss': '1.506', 'grad_norm': '1.976', 'learning_rate': '4.998e-05', 'epoch': '0.1641', 'num_input_tokens_seen': 13340299, 'train_runtime': '6748', 'train_tokens_per_second': '1977'} +{'loss': '0.321', 'grad_norm': '0.8257', 'learning_rate': '4.998e-05', 'epoch': '0.1641', 'num_input_tokens_seen': 13342346, 'train_runtime': '6749', 'train_tokens_per_second': '1977'} +{'loss': '1.122', 'grad_norm': '1.844', 'learning_rate': '4.998e-05', 'epoch': '0.1641', 'num_input_tokens_seen': 13344393, 'train_runtime': '6750', 'train_tokens_per_second': '1977'} +{'loss': '0.792', 'grad_norm': '1.438', 'learning_rate': '4.998e-05', 'epoch': '0.1642', 'num_input_tokens_seen': 13346440, 'train_runtime': '6751', 'train_tokens_per_second': '1977'} +{'loss': '0.3482', 'grad_norm': '0.9702', 'learning_rate': '4.998e-05', 'epoch': '0.1642', 'num_input_tokens_seen': 13348487, 'train_runtime': '6752', 'train_tokens_per_second': '1977'} +{'loss': '0.4442', 'grad_norm': '1.003', 'learning_rate': '4.998e-05', 'epoch': '0.1642', 'num_input_tokens_seen': 13350534, 'train_runtime': '6753', 'train_tokens_per_second': '1977'} +{'loss': '0.3603', 'grad_norm': '0.853', 'learning_rate': '4.998e-05', 'epoch': '0.1642', 'num_input_tokens_seen': 13352581, 'train_runtime': '6754', 'train_tokens_per_second': '1977'} +{'loss': '0.3936', 'grad_norm': '0.9928', 'learning_rate': '4.998e-05', 'epoch': '0.1643', 'num_input_tokens_seen': 13354628, 'train_runtime': '6755', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '1.599', 'learning_rate': '4.998e-05', 'epoch': '0.1643', 'num_input_tokens_seen': 13356675, 'train_runtime': '6756', 'train_tokens_per_second': '1977'} +{'loss': '0.6286', 'grad_norm': '1.2', 'learning_rate': '4.998e-05', 'epoch': '0.1643', 'num_input_tokens_seen': 13358722, 'train_runtime': '6757', 'train_tokens_per_second': '1977'} +{'loss': '0.8489', 'grad_norm': '1.517', 'learning_rate': '4.998e-05', 'epoch': '0.1643', 'num_input_tokens_seen': 13360769, 'train_runtime': '6758', 'train_tokens_per_second': '1977'} +{'loss': '0.5894', 'grad_norm': '1.301', 'learning_rate': '4.998e-05', 'epoch': '0.1644', 'num_input_tokens_seen': 13362816, 'train_runtime': '6759', 'train_tokens_per_second': '1977'} +{'loss': '0.4496', 'grad_norm': '1.298', 'learning_rate': '4.998e-05', 'epoch': '0.1644', 'num_input_tokens_seen': 13364863, 'train_runtime': '6760', 'train_tokens_per_second': '1977'} +{'loss': '0.2912', 'grad_norm': '1.073', 'learning_rate': '4.998e-05', 'epoch': '0.1644', 'num_input_tokens_seen': 13366910, 'train_runtime': '6761', 'train_tokens_per_second': '1977'} +{'loss': '1.319', 'grad_norm': '2.255', 'learning_rate': '4.998e-05', 'epoch': '0.1644', 'num_input_tokens_seen': 13368957, 'train_runtime': '6762', 'train_tokens_per_second': '1977'} +{'loss': '0.8179', 'grad_norm': '1.579', 'learning_rate': '4.998e-05', 'epoch': '0.1645', 'num_input_tokens_seen': 13371004, 'train_runtime': '6763', 'train_tokens_per_second': '1977'} +{'loss': '1.311', 'grad_norm': '2.047', 'learning_rate': '4.998e-05', 'epoch': '0.1645', 'num_input_tokens_seen': 13373051, 'train_runtime': '6764', 'train_tokens_per_second': '1977'} +{'loss': '0.7527', 'grad_norm': '1.111', 'learning_rate': '4.998e-05', 'epoch': '0.1645', 'num_input_tokens_seen': 13375098, 'train_runtime': '6765', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '1.935', 'learning_rate': '4.998e-05', 'epoch': '0.1645', 'num_input_tokens_seen': 13377145, 'train_runtime': '6766', 'train_tokens_per_second': '1977'} +{'loss': '0.9317', 'grad_norm': '1.621', 'learning_rate': '4.998e-05', 'epoch': '0.1646', 'num_input_tokens_seen': 13379192, 'train_runtime': '6767', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '1.644', 'learning_rate': '4.998e-05', 'epoch': '0.1646', 'num_input_tokens_seen': 13381239, 'train_runtime': '6768', 'train_tokens_per_second': '1977'} +{'loss': '0.2693', 'grad_norm': '1.081', 'learning_rate': '4.998e-05', 'epoch': '0.1646', 'num_input_tokens_seen': 13383286, 'train_runtime': '6769', 'train_tokens_per_second': '1977'} +{'loss': '0.5326', 'grad_norm': '1.123', 'learning_rate': '4.998e-05', 'epoch': '0.1646', 'num_input_tokens_seen': 13385333, 'train_runtime': '6770', 'train_tokens_per_second': '1977'} +{'loss': '0.3142', 'grad_norm': '1.066', 'learning_rate': '4.998e-05', 'epoch': '0.1647', 'num_input_tokens_seen': 13387380, 'train_runtime': '6771', 'train_tokens_per_second': '1977'} +{'loss': '0.3295', 'grad_norm': '0.854', 'learning_rate': '4.998e-05', 'epoch': '0.1647', 'num_input_tokens_seen': 13389427, 'train_runtime': '6773', 'train_tokens_per_second': '1977'} +{'loss': '0.5538', 'grad_norm': '1.319', 'learning_rate': '4.998e-05', 'epoch': '0.1647', 'num_input_tokens_seen': 13391474, 'train_runtime': '6774', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '1.675', 'learning_rate': '4.998e-05', 'epoch': '0.1647', 'num_input_tokens_seen': 13393521, 'train_runtime': '6775', 'train_tokens_per_second': '1977'} +{'loss': '1.012', 'grad_norm': '1.336', 'learning_rate': '4.998e-05', 'epoch': '0.1648', 'num_input_tokens_seen': 13395568, 'train_runtime': '6776', 'train_tokens_per_second': '1977'} +{'loss': '0.2597', 'grad_norm': '0.9751', 'learning_rate': '4.998e-05', 'epoch': '0.1648', 'num_input_tokens_seen': 13397615, 'train_runtime': '6777', 'train_tokens_per_second': '1977'} +{'loss': '1.538', 'grad_norm': '2.144', 'learning_rate': '4.998e-05', 'epoch': '0.1648', 'num_input_tokens_seen': 13399662, 'train_runtime': '6778', 'train_tokens_per_second': '1977'} +{'loss': '0.7669', 'grad_norm': '1.61', 'learning_rate': '4.998e-05', 'epoch': '0.1648', 'num_input_tokens_seen': 13401709, 'train_runtime': '6779', 'train_tokens_per_second': '1977'} +{'loss': '0.7083', 'grad_norm': '1.571', 'learning_rate': '4.998e-05', 'epoch': '0.1649', 'num_input_tokens_seen': 13403756, 'train_runtime': '6780', 'train_tokens_per_second': '1977'} +{'loss': '0.6888', 'grad_norm': '1.58', 'learning_rate': '4.998e-05', 'epoch': '0.1649', 'num_input_tokens_seen': 13405803, 'train_runtime': '6781', 'train_tokens_per_second': '1977'} +{'loss': '0.8893', 'grad_norm': '1.371', 'learning_rate': '4.998e-05', 'epoch': '0.1649', 'num_input_tokens_seen': 13407850, 'train_runtime': '6782', 'train_tokens_per_second': '1977'} +{'loss': '0.2828', 'grad_norm': '0.9799', 'learning_rate': '4.998e-05', 'epoch': '0.1649', 'num_input_tokens_seen': 13409897, 'train_runtime': '6783', 'train_tokens_per_second': '1977'} +{'loss': '0.3867', 'grad_norm': '0.949', 'learning_rate': '4.998e-05', 'epoch': '0.165', 'num_input_tokens_seen': 13411944, 'train_runtime': '6784', 'train_tokens_per_second': '1977'} +{'loss': '1.32', 'grad_norm': '1.77', 'learning_rate': '4.998e-05', 'epoch': '0.165', 'num_input_tokens_seen': 13413991, 'train_runtime': '6785', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '1.422', 'learning_rate': '4.998e-05', 'epoch': '0.165', 'num_input_tokens_seen': 13416038, 'train_runtime': '6786', 'train_tokens_per_second': '1977'} +{'loss': '0.7513', 'grad_norm': '1.522', 'learning_rate': '4.998e-05', 'epoch': '0.165', 'num_input_tokens_seen': 13418085, 'train_runtime': '6787', 'train_tokens_per_second': '1977'} +{'loss': '0.3905', 'grad_norm': '1.023', 'learning_rate': '4.998e-05', 'epoch': '0.1651', 'num_input_tokens_seen': 13420132, 'train_runtime': '6788', 'train_tokens_per_second': '1977'} +{'loss': '0.3248', 'grad_norm': '1.257', 'learning_rate': '4.998e-05', 'epoch': '0.1651', 'num_input_tokens_seen': 13422179, 'train_runtime': '6789', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '2.171', 'learning_rate': '4.998e-05', 'epoch': '0.1651', 'num_input_tokens_seen': 13424226, 'train_runtime': '6790', 'train_tokens_per_second': '1977'} +{'loss': '0.707', 'grad_norm': '1.63', 'learning_rate': '4.998e-05', 'epoch': '0.1651', 'num_input_tokens_seen': 13426273, 'train_runtime': '6791', 'train_tokens_per_second': '1977'} +{'loss': '0.774', 'grad_norm': '1.545', 'learning_rate': '4.998e-05', 'epoch': '0.1652', 'num_input_tokens_seen': 13428320, 'train_runtime': '6792', 'train_tokens_per_second': '1977'} +{'loss': '0.4057', 'grad_norm': '0.9926', 'learning_rate': '4.998e-05', 'epoch': '0.1652', 'num_input_tokens_seen': 13430367, 'train_runtime': '6793', 'train_tokens_per_second': '1977'} +{'loss': '0.4297', 'grad_norm': '0.9974', 'learning_rate': '4.998e-05', 'epoch': '0.1652', 'num_input_tokens_seen': 13432414, 'train_runtime': '6794', 'train_tokens_per_second': '1977'} +{'loss': '0.6972', 'grad_norm': '1.469', 'learning_rate': '4.998e-05', 'epoch': '0.1652', 'num_input_tokens_seen': 13434461, 'train_runtime': '6795', 'train_tokens_per_second': '1977'} +{'loss': '1.312', 'grad_norm': '1.914', 'learning_rate': '4.998e-05', 'epoch': '0.1653', 'num_input_tokens_seen': 13436508, 'train_runtime': '6796', 'train_tokens_per_second': '1977'} +{'loss': '0.2686', 'grad_norm': '1.058', 'learning_rate': '4.998e-05', 'epoch': '0.1653', 'num_input_tokens_seen': 13438555, 'train_runtime': '6797', 'train_tokens_per_second': '1977'} +{'loss': '1.171', 'grad_norm': '1.754', 'learning_rate': '4.998e-05', 'epoch': '0.1653', 'num_input_tokens_seen': 13440602, 'train_runtime': '6798', 'train_tokens_per_second': '1977'} +{'loss': '0.9468', 'grad_norm': '1.542', 'learning_rate': '4.998e-05', 'epoch': '0.1653', 'num_input_tokens_seen': 13442649, 'train_runtime': '6799', 'train_tokens_per_second': '1977'} +{'loss': '0.4444', 'grad_norm': '0.9031', 'learning_rate': '4.998e-05', 'epoch': '0.1654', 'num_input_tokens_seen': 13444696, 'train_runtime': '6800', 'train_tokens_per_second': '1977'} +{'loss': '0.776', 'grad_norm': '1.568', 'learning_rate': '4.998e-05', 'epoch': '0.1654', 'num_input_tokens_seen': 13446743, 'train_runtime': '6801', 'train_tokens_per_second': '1977'} +{'loss': '0.2634', 'grad_norm': '0.9877', 'learning_rate': '4.998e-05', 'epoch': '0.1654', 'num_input_tokens_seen': 13448790, 'train_runtime': '6802', 'train_tokens_per_second': '1977'} +{'loss': '0.9422', 'grad_norm': '1.579', 'learning_rate': '4.998e-05', 'epoch': '0.1654', 'num_input_tokens_seen': 13450837, 'train_runtime': '6804', 'train_tokens_per_second': '1977'} +{'loss': '0.3667', 'grad_norm': '0.9184', 'learning_rate': '4.998e-05', 'epoch': '0.1655', 'num_input_tokens_seen': 13452884, 'train_runtime': '6805', 'train_tokens_per_second': '1977'} +{'loss': '0.4396', 'grad_norm': '1.241', 'learning_rate': '4.998e-05', 'epoch': '0.1655', 'num_input_tokens_seen': 13454931, 'train_runtime': '6806', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.054', 'learning_rate': '4.998e-05', 'epoch': '0.1655', 'num_input_tokens_seen': 13456978, 'train_runtime': '6807', 'train_tokens_per_second': '1977'} +{'loss': '0.8862', 'grad_norm': '1.223', 'learning_rate': '4.998e-05', 'epoch': '0.1655', 'num_input_tokens_seen': 13459025, 'train_runtime': '6808', 'train_tokens_per_second': '1977'} +{'loss': '1.845', 'grad_norm': '2.553', 'learning_rate': '4.998e-05', 'epoch': '0.1656', 'num_input_tokens_seen': 13461072, 'train_runtime': '6809', 'train_tokens_per_second': '1977'} +{'loss': '0.7777', 'grad_norm': '1.812', 'learning_rate': '4.998e-05', 'epoch': '0.1656', 'num_input_tokens_seen': 13463119, 'train_runtime': '6810', 'train_tokens_per_second': '1977'} +{'loss': '0.8657', 'grad_norm': '1.406', 'learning_rate': '4.998e-05', 'epoch': '0.1656', 'num_input_tokens_seen': 13465166, 'train_runtime': '6811', 'train_tokens_per_second': '1977'} +{'loss': '1.528', 'grad_norm': '2.177', 'learning_rate': '4.998e-05', 'epoch': '0.1656', 'num_input_tokens_seen': 13467213, 'train_runtime': '6812', 'train_tokens_per_second': '1977'} +{'loss': '1.472', 'grad_norm': '2.544', 'learning_rate': '4.998e-05', 'epoch': '0.1657', 'num_input_tokens_seen': 13469260, 'train_runtime': '6813', 'train_tokens_per_second': '1977'} +{'loss': '0.389', 'grad_norm': '1.142', 'learning_rate': '4.998e-05', 'epoch': '0.1657', 'num_input_tokens_seen': 13471307, 'train_runtime': '6814', 'train_tokens_per_second': '1977'} +{'loss': '2.727', 'grad_norm': '2.542', 'learning_rate': '4.998e-05', 'epoch': '0.1657', 'num_input_tokens_seen': 13473354, 'train_runtime': '6815', 'train_tokens_per_second': '1977'} +{'loss': '0.7379', 'grad_norm': '1.212', 'learning_rate': '4.998e-05', 'epoch': '0.1657', 'num_input_tokens_seen': 13475401, 'train_runtime': '6816', 'train_tokens_per_second': '1977'} +{'loss': '0.8217', 'grad_norm': '1.62', 'learning_rate': '4.998e-05', 'epoch': '0.1658', 'num_input_tokens_seen': 13477448, 'train_runtime': '6817', 'train_tokens_per_second': '1977'} +{'loss': '0.8135', 'grad_norm': '1.741', 'learning_rate': '4.998e-05', 'epoch': '0.1658', 'num_input_tokens_seen': 13479495, 'train_runtime': '6818', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '1.881', 'learning_rate': '4.998e-05', 'epoch': '0.1658', 'num_input_tokens_seen': 13481542, 'train_runtime': '6819', 'train_tokens_per_second': '1977'} +{'loss': '0.3482', 'grad_norm': '0.8419', 'learning_rate': '4.998e-05', 'epoch': '0.1658', 'num_input_tokens_seen': 13483589, 'train_runtime': '6820', 'train_tokens_per_second': '1977'} +{'loss': '1.263', 'grad_norm': '1.946', 'learning_rate': '4.998e-05', 'epoch': '0.1659', 'num_input_tokens_seen': 13485636, 'train_runtime': '6821', 'train_tokens_per_second': '1977'} +{'loss': '0.4069', 'grad_norm': '1.17', 'learning_rate': '4.998e-05', 'epoch': '0.1659', 'num_input_tokens_seen': 13487683, 'train_runtime': '6822', 'train_tokens_per_second': '1977'} +{'loss': '1.895', 'grad_norm': '2.42', 'learning_rate': '4.998e-05', 'epoch': '0.1659', 'num_input_tokens_seen': 13489730, 'train_runtime': '6823', 'train_tokens_per_second': '1977'} +{'loss': '0.6255', 'grad_norm': '1.842', 'learning_rate': '4.998e-05', 'epoch': '0.1659', 'num_input_tokens_seen': 13491777, 'train_runtime': '6824', 'train_tokens_per_second': '1977'} +{'loss': '0.527', 'grad_norm': '1.019', 'learning_rate': '4.998e-05', 'epoch': '0.166', 'num_input_tokens_seen': 13493824, 'train_runtime': '6825', 'train_tokens_per_second': '1977'} +{'loss': '1.996', 'grad_norm': '2.226', 'learning_rate': '4.998e-05', 'epoch': '0.166', 'num_input_tokens_seen': 13495871, 'train_runtime': '6826', 'train_tokens_per_second': '1977'} +{'loss': '1.215', 'grad_norm': '1.917', 'learning_rate': '4.998e-05', 'epoch': '0.166', 'num_input_tokens_seen': 13497918, 'train_runtime': '6827', 'train_tokens_per_second': '1977'} +{'loss': '0.7692', 'grad_norm': '1.25', 'learning_rate': '4.998e-05', 'epoch': '0.166', 'num_input_tokens_seen': 13499965, 'train_runtime': '6828', 'train_tokens_per_second': '1977'} +{'loss': '0.6271', 'grad_norm': '1.63', 'learning_rate': '4.998e-05', 'epoch': '0.1661', 'num_input_tokens_seen': 13502012, 'train_runtime': '6829', 'train_tokens_per_second': '1977'} +{'loss': '1.426', 'grad_norm': '1.985', 'learning_rate': '4.998e-05', 'epoch': '0.1661', 'num_input_tokens_seen': 13504059, 'train_runtime': '6830', 'train_tokens_per_second': '1977'} +{'loss': '0.4668', 'grad_norm': '1.232', 'learning_rate': '4.998e-05', 'epoch': '0.1661', 'num_input_tokens_seen': 13506106, 'train_runtime': '6832', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '2.143', 'learning_rate': '4.998e-05', 'epoch': '0.1662', 'num_input_tokens_seen': 13508153, 'train_runtime': '6833', 'train_tokens_per_second': '1977'} +{'loss': '0.3801', 'grad_norm': '0.9643', 'learning_rate': '4.998e-05', 'epoch': '0.1662', 'num_input_tokens_seen': 13510200, 'train_runtime': '6834', 'train_tokens_per_second': '1977'} +{'loss': '0.425', 'grad_norm': '1.286', 'learning_rate': '4.998e-05', 'epoch': '0.1662', 'num_input_tokens_seen': 13512247, 'train_runtime': '6835', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '1.279', 'learning_rate': '4.998e-05', 'epoch': '0.1662', 'num_input_tokens_seen': 13514294, 'train_runtime': '6836', 'train_tokens_per_second': '1977'} +{'loss': '0.7027', 'grad_norm': '1.574', 'learning_rate': '4.998e-05', 'epoch': '0.1663', 'num_input_tokens_seen': 13516341, 'train_runtime': '6837', 'train_tokens_per_second': '1977'} +{'loss': '0.694', 'grad_norm': '1.17', 'learning_rate': '4.998e-05', 'epoch': '0.1663', 'num_input_tokens_seen': 13518388, 'train_runtime': '6838', 'train_tokens_per_second': '1977'} +{'loss': '1.312', 'grad_norm': '1.724', 'learning_rate': '4.998e-05', 'epoch': '0.1663', 'num_input_tokens_seen': 13520435, 'train_runtime': '6839', 'train_tokens_per_second': '1977'} +{'loss': '0.9927', 'grad_norm': '1.545', 'learning_rate': '4.998e-05', 'epoch': '0.1663', 'num_input_tokens_seen': 13522482, 'train_runtime': '6840', 'train_tokens_per_second': '1977'} +{'loss': '0.3935', 'grad_norm': '0.9688', 'learning_rate': '4.998e-05', 'epoch': '0.1664', 'num_input_tokens_seen': 13524529, 'train_runtime': '6841', 'train_tokens_per_second': '1977'} +{'loss': '1.418', 'grad_norm': '2.238', 'learning_rate': '4.998e-05', 'epoch': '0.1664', 'num_input_tokens_seen': 13526576, 'train_runtime': '6842', 'train_tokens_per_second': '1977'} +{'loss': '1.52', 'grad_norm': '2.184', 'learning_rate': '4.998e-05', 'epoch': '0.1664', 'num_input_tokens_seen': 13528623, 'train_runtime': '6843', 'train_tokens_per_second': '1977'} +{'loss': '0.5154', 'grad_norm': '1.146', 'learning_rate': '4.998e-05', 'epoch': '0.1664', 'num_input_tokens_seen': 13530670, 'train_runtime': '6844', 'train_tokens_per_second': '1977'} +{'loss': '0.2525', 'grad_norm': '0.8801', 'learning_rate': '4.998e-05', 'epoch': '0.1665', 'num_input_tokens_seen': 13532717, 'train_runtime': '6845', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.577', 'learning_rate': '4.998e-05', 'epoch': '0.1665', 'num_input_tokens_seen': 13534764, 'train_runtime': '6846', 'train_tokens_per_second': '1977'} +{'loss': '0.7106', 'grad_norm': '1.369', 'learning_rate': '4.998e-05', 'epoch': '0.1665', 'num_input_tokens_seen': 13536811, 'train_runtime': '6847', 'train_tokens_per_second': '1977'} +{'loss': '2.578', 'grad_norm': '2.445', 'learning_rate': '4.998e-05', 'epoch': '0.1665', 'num_input_tokens_seen': 13538858, 'train_runtime': '6848', 'train_tokens_per_second': '1977'} +{'loss': '0.7776', 'grad_norm': '1.067', 'learning_rate': '4.998e-05', 'epoch': '0.1666', 'num_input_tokens_seen': 13540905, 'train_runtime': '6849', 'train_tokens_per_second': '1977'} +{'loss': '0.7881', 'grad_norm': '1.009', 'learning_rate': '4.998e-05', 'epoch': '0.1666', 'num_input_tokens_seen': 13542952, 'train_runtime': '6850', 'train_tokens_per_second': '1977'} +{'loss': '0.2836', 'grad_norm': '0.9461', 'learning_rate': '4.998e-05', 'epoch': '0.1666', 'num_input_tokens_seen': 13544999, 'train_runtime': '6851', 'train_tokens_per_second': '1977'} +{'loss': '0.7385', 'grad_norm': '1.213', 'learning_rate': '4.998e-05', 'epoch': '0.1666', 'num_input_tokens_seen': 13547046, 'train_runtime': '6852', 'train_tokens_per_second': '1977'} +{'loss': '0.5278', 'grad_norm': '1.392', 'learning_rate': '4.998e-05', 'epoch': '0.1667', 'num_input_tokens_seen': 13549093, 'train_runtime': '6853', 'train_tokens_per_second': '1977'} +{'loss': '0.9205', 'grad_norm': '1.381', 'learning_rate': '4.998e-05', 'epoch': '0.1667', 'num_input_tokens_seen': 13551140, 'train_runtime': '6854', 'train_tokens_per_second': '1977'} +{'loss': '0.7425', 'grad_norm': '1.498', 'learning_rate': '4.998e-05', 'epoch': '0.1667', 'num_input_tokens_seen': 13553187, 'train_runtime': '6855', 'train_tokens_per_second': '1977'} +{'loss': '0.479', 'grad_norm': '1.366', 'learning_rate': '4.998e-05', 'epoch': '0.1667', 'num_input_tokens_seen': 13555234, 'train_runtime': '6856', 'train_tokens_per_second': '1977'} +{'loss': '0.5231', 'grad_norm': '1.244', 'learning_rate': '4.998e-05', 'epoch': '0.1668', 'num_input_tokens_seen': 13557281, 'train_runtime': '6857', 'train_tokens_per_second': '1977'} +{'loss': '0.6974', 'grad_norm': '1.062', 'learning_rate': '4.998e-05', 'epoch': '0.1668', 'num_input_tokens_seen': 13559328, 'train_runtime': '6858', 'train_tokens_per_second': '1977'} +{'loss': '0.5261', 'grad_norm': '1.043', 'learning_rate': '4.998e-05', 'epoch': '0.1668', 'num_input_tokens_seen': 13561375, 'train_runtime': '6859', 'train_tokens_per_second': '1977'} +{'loss': '0.7244', 'grad_norm': '1.671', 'learning_rate': '4.998e-05', 'epoch': '0.1668', 'num_input_tokens_seen': 13563422, 'train_runtime': '6861', 'train_tokens_per_second': '1977'} +{'loss': '0.5647', 'grad_norm': '1.153', 'learning_rate': '4.998e-05', 'epoch': '0.1669', 'num_input_tokens_seen': 13565469, 'train_runtime': '6862', 'train_tokens_per_second': '1977'} +{'loss': '0.521', 'grad_norm': '1.235', 'learning_rate': '4.998e-05', 'epoch': '0.1669', 'num_input_tokens_seen': 13567516, 'train_runtime': '6863', 'train_tokens_per_second': '1977'} +{'loss': '0.279', 'grad_norm': '1.025', 'learning_rate': '4.998e-05', 'epoch': '0.1669', 'num_input_tokens_seen': 13569563, 'train_runtime': '6864', 'train_tokens_per_second': '1977'} +{'loss': '0.4185', 'grad_norm': '0.9888', 'learning_rate': '4.998e-05', 'epoch': '0.1669', 'num_input_tokens_seen': 13571610, 'train_runtime': '6865', 'train_tokens_per_second': '1977'} +{'loss': '0.9544', 'grad_norm': '1.822', 'learning_rate': '4.998e-05', 'epoch': '0.167', 'num_input_tokens_seen': 13573657, 'train_runtime': '6866', 'train_tokens_per_second': '1977'} +{'loss': '0.4354', 'grad_norm': '0.9598', 'learning_rate': '4.998e-05', 'epoch': '0.167', 'num_input_tokens_seen': 13575704, 'train_runtime': '6867', 'train_tokens_per_second': '1977'} +{'loss': '0.7256', 'grad_norm': '1.296', 'learning_rate': '4.998e-05', 'epoch': '0.167', 'num_input_tokens_seen': 13577751, 'train_runtime': '6868', 'train_tokens_per_second': '1977'} +{'loss': '0.6485', 'grad_norm': '2.135', 'learning_rate': '4.998e-05', 'epoch': '0.167', 'num_input_tokens_seen': 13579798, 'train_runtime': '6869', 'train_tokens_per_second': '1977'} +{'loss': '0.616', 'grad_norm': '1.371', 'learning_rate': '4.998e-05', 'epoch': '0.1671', 'num_input_tokens_seen': 13581845, 'train_runtime': '6870', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.494', 'learning_rate': '4.998e-05', 'epoch': '0.1671', 'num_input_tokens_seen': 13583892, 'train_runtime': '6871', 'train_tokens_per_second': '1977'} +{'loss': '1.876', 'grad_norm': '3.075', 'learning_rate': '4.998e-05', 'epoch': '0.1671', 'num_input_tokens_seen': 13585939, 'train_runtime': '6872', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '2.145', 'learning_rate': '4.998e-05', 'epoch': '0.1671', 'num_input_tokens_seen': 13587986, 'train_runtime': '6873', 'train_tokens_per_second': '1977'} +{'loss': '0.9172', 'grad_norm': '1.6', 'learning_rate': '4.998e-05', 'epoch': '0.1672', 'num_input_tokens_seen': 13590033, 'train_runtime': '6874', 'train_tokens_per_second': '1977'} +{'loss': '0.8248', 'grad_norm': '1.298', 'learning_rate': '4.998e-05', 'epoch': '0.1672', 'num_input_tokens_seen': 13592080, 'train_runtime': '6875', 'train_tokens_per_second': '1977'} +{'loss': '0.3272', 'grad_norm': '0.9605', 'learning_rate': '4.998e-05', 'epoch': '0.1672', 'num_input_tokens_seen': 13594127, 'train_runtime': '6876', 'train_tokens_per_second': '1977'} +{'loss': '1.249', 'grad_norm': '2.048', 'learning_rate': '4.998e-05', 'epoch': '0.1672', 'num_input_tokens_seen': 13596174, 'train_runtime': '6877', 'train_tokens_per_second': '1977'} +{'loss': '0.965', 'grad_norm': '1.829', 'learning_rate': '4.998e-05', 'epoch': '0.1673', 'num_input_tokens_seen': 13598221, 'train_runtime': '6878', 'train_tokens_per_second': '1977'} +{'loss': '0.721', 'grad_norm': '1.397', 'learning_rate': '4.998e-05', 'epoch': '0.1673', 'num_input_tokens_seen': 13600268, 'train_runtime': '6879', 'train_tokens_per_second': '1977'} +{'loss': '1.25', 'grad_norm': '2.168', 'learning_rate': '4.998e-05', 'epoch': '0.1673', 'num_input_tokens_seen': 13602315, 'train_runtime': '6880', 'train_tokens_per_second': '1977'} +{'loss': '1.347', 'grad_norm': '1.923', 'learning_rate': '4.998e-05', 'epoch': '0.1673', 'num_input_tokens_seen': 13604362, 'train_runtime': '6881', 'train_tokens_per_second': '1977'} +{'loss': '1.542', 'grad_norm': '2.152', 'learning_rate': '4.998e-05', 'epoch': '0.1674', 'num_input_tokens_seen': 13606409, 'train_runtime': '6882', 'train_tokens_per_second': '1977'} +{'loss': '1.186', 'grad_norm': '1.947', 'learning_rate': '4.998e-05', 'epoch': '0.1674', 'num_input_tokens_seen': 13608456, 'train_runtime': '6883', 'train_tokens_per_second': '1977'} +{'loss': '0.288', 'grad_norm': '0.9001', 'learning_rate': '4.998e-05', 'epoch': '0.1674', 'num_input_tokens_seen': 13610503, 'train_runtime': '6884', 'train_tokens_per_second': '1977'} +{'loss': '0.7511', 'grad_norm': '1.161', 'learning_rate': '4.998e-05', 'epoch': '0.1674', 'num_input_tokens_seen': 13612550, 'train_runtime': '6885', 'train_tokens_per_second': '1977'} +{'loss': '0.7323', 'grad_norm': '1.526', 'learning_rate': '4.998e-05', 'epoch': '0.1675', 'num_input_tokens_seen': 13614597, 'train_runtime': '6886', 'train_tokens_per_second': '1977'} +{'loss': '0.7913', 'grad_norm': '1.307', 'learning_rate': '4.998e-05', 'epoch': '0.1675', 'num_input_tokens_seen': 13616644, 'train_runtime': '6887', 'train_tokens_per_second': '1977'} +{'loss': '2.167', 'grad_norm': '2.857', 'learning_rate': '4.998e-05', 'epoch': '0.1675', 'num_input_tokens_seen': 13618691, 'train_runtime': '6888', 'train_tokens_per_second': '1977'} +{'loss': '0.3295', 'grad_norm': '1.249', 'learning_rate': '4.998e-05', 'epoch': '0.1675', 'num_input_tokens_seen': 13620738, 'train_runtime': '6890', 'train_tokens_per_second': '1977'} +{'loss': '1.683', 'grad_norm': '2.225', 'learning_rate': '4.998e-05', 'epoch': '0.1676', 'num_input_tokens_seen': 13622785, 'train_runtime': '6891', 'train_tokens_per_second': '1977'} +{'loss': '0.5728', 'grad_norm': '1.349', 'learning_rate': '4.998e-05', 'epoch': '0.1676', 'num_input_tokens_seen': 13624832, 'train_runtime': '6892', 'train_tokens_per_second': '1977'} +{'loss': '0.9058', 'grad_norm': '1.169', 'learning_rate': '4.998e-05', 'epoch': '0.1676', 'num_input_tokens_seen': 13626879, 'train_runtime': '6893', 'train_tokens_per_second': '1977'} +{'loss': '0.6032', 'grad_norm': '1.215', 'learning_rate': '4.998e-05', 'epoch': '0.1676', 'num_input_tokens_seen': 13628926, 'train_runtime': '6894', 'train_tokens_per_second': '1977'} +{'loss': '0.8822', 'grad_norm': '1.463', 'learning_rate': '4.998e-05', 'epoch': '0.1677', 'num_input_tokens_seen': 13630973, 'train_runtime': '6895', 'train_tokens_per_second': '1977'} +{'loss': '1.892', 'grad_norm': '2.196', 'learning_rate': '4.998e-05', 'epoch': '0.1677', 'num_input_tokens_seen': 13633020, 'train_runtime': '6896', 'train_tokens_per_second': '1977'} +{'loss': '0.358', 'grad_norm': '0.999', 'learning_rate': '4.998e-05', 'epoch': '0.1677', 'num_input_tokens_seen': 13635067, 'train_runtime': '6897', 'train_tokens_per_second': '1977'} +{'loss': '0.2964', 'grad_norm': '0.9842', 'learning_rate': '4.998e-05', 'epoch': '0.1677', 'num_input_tokens_seen': 13637114, 'train_runtime': '6898', 'train_tokens_per_second': '1977'} +{'loss': '0.4073', 'grad_norm': '1.036', 'learning_rate': '4.998e-05', 'epoch': '0.1678', 'num_input_tokens_seen': 13639161, 'train_runtime': '6899', 'train_tokens_per_second': '1977'} +{'loss': '0.8616', 'grad_norm': '1.464', 'learning_rate': '4.998e-05', 'epoch': '0.1678', 'num_input_tokens_seen': 13641208, 'train_runtime': '6900', 'train_tokens_per_second': '1977'} +{'loss': '0.4908', 'grad_norm': '1.175', 'learning_rate': '4.998e-05', 'epoch': '0.1678', 'num_input_tokens_seen': 13643255, 'train_runtime': '6901', 'train_tokens_per_second': '1977'} +{'loss': '0.4381', 'grad_norm': '1.043', 'learning_rate': '4.998e-05', 'epoch': '0.1678', 'num_input_tokens_seen': 13645302, 'train_runtime': '6902', 'train_tokens_per_second': '1977'} +{'loss': '0.8543', 'grad_norm': '1.817', 'learning_rate': '4.998e-05', 'epoch': '0.1679', 'num_input_tokens_seen': 13647349, 'train_runtime': '6903', 'train_tokens_per_second': '1977'} +{'loss': '2.334', 'grad_norm': '2.54', 'learning_rate': '4.998e-05', 'epoch': '0.1679', 'num_input_tokens_seen': 13649396, 'train_runtime': '6904', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '1.357', 'learning_rate': '4.998e-05', 'epoch': '0.1679', 'num_input_tokens_seen': 13651443, 'train_runtime': '6905', 'train_tokens_per_second': '1977'} +{'loss': '0.8097', 'grad_norm': '1.749', 'learning_rate': '4.998e-05', 'epoch': '0.1679', 'num_input_tokens_seen': 13653490, 'train_runtime': '6906', 'train_tokens_per_second': '1977'} +{'loss': '0.9288', 'grad_norm': '1.354', 'learning_rate': '4.998e-05', 'epoch': '0.168', 'num_input_tokens_seen': 13655537, 'train_runtime': '6907', 'train_tokens_per_second': '1977'} +{'loss': '0.7265', 'grad_norm': '1.623', 'learning_rate': '4.998e-05', 'epoch': '0.168', 'num_input_tokens_seen': 13657584, 'train_runtime': '6908', 'train_tokens_per_second': '1977'} +{'loss': '0.5121', 'grad_norm': '1.248', 'learning_rate': '4.998e-05', 'epoch': '0.168', 'num_input_tokens_seen': 13659631, 'train_runtime': '6909', 'train_tokens_per_second': '1977'} +{'loss': '0.3365', 'grad_norm': '0.8133', 'learning_rate': '4.998e-05', 'epoch': '0.168', 'num_input_tokens_seen': 13661678, 'train_runtime': '6910', 'train_tokens_per_second': '1977'} +{'loss': '0.9648', 'grad_norm': '1.579', 'learning_rate': '4.998e-05', 'epoch': '0.1681', 'num_input_tokens_seen': 13663725, 'train_runtime': '6911', 'train_tokens_per_second': '1977'} +{'loss': '0.5296', 'grad_norm': '1.157', 'learning_rate': '4.998e-05', 'epoch': '0.1681', 'num_input_tokens_seen': 13665772, 'train_runtime': '6912', 'train_tokens_per_second': '1977'} +{'loss': '0.3834', 'grad_norm': '0.9172', 'learning_rate': '4.998e-05', 'epoch': '0.1681', 'num_input_tokens_seen': 13667819, 'train_runtime': '6913', 'train_tokens_per_second': '1977'} +{'loss': '0.8864', 'grad_norm': '1.721', 'learning_rate': '4.998e-05', 'epoch': '0.1681', 'num_input_tokens_seen': 13669866, 'train_runtime': '6914', 'train_tokens_per_second': '1977'} +{'loss': '0.6889', 'grad_norm': '1.577', 'learning_rate': '4.998e-05', 'epoch': '0.1682', 'num_input_tokens_seen': 13671913, 'train_runtime': '6915', 'train_tokens_per_second': '1977'} +{'loss': '0.2778', 'grad_norm': '0.8799', 'learning_rate': '4.998e-05', 'epoch': '0.1682', 'num_input_tokens_seen': 13673960, 'train_runtime': '6916', 'train_tokens_per_second': '1977'} +{'loss': '1.681', 'grad_norm': '2.411', 'learning_rate': '4.998e-05', 'epoch': '0.1682', 'num_input_tokens_seen': 13676007, 'train_runtime': '6917', 'train_tokens_per_second': '1977'} +{'loss': '0.9141', 'grad_norm': '1.232', 'learning_rate': '4.998e-05', 'epoch': '0.1682', 'num_input_tokens_seen': 13678054, 'train_runtime': '6918', 'train_tokens_per_second': '1977'} +{'loss': '0.764', 'grad_norm': '1.419', 'learning_rate': '4.998e-05', 'epoch': '0.1683', 'num_input_tokens_seen': 13680101, 'train_runtime': '6920', 'train_tokens_per_second': '1977'} +{'loss': '0.5883', 'grad_norm': '1.412', 'learning_rate': '4.998e-05', 'epoch': '0.1683', 'num_input_tokens_seen': 13682148, 'train_runtime': '6921', 'train_tokens_per_second': '1977'} +{'loss': '1.473', 'grad_norm': '2.032', 'learning_rate': '4.998e-05', 'epoch': '0.1683', 'num_input_tokens_seen': 13684195, 'train_runtime': '6922', 'train_tokens_per_second': '1977'} +{'loss': '0.5388', 'grad_norm': '1.422', 'learning_rate': '4.998e-05', 'epoch': '0.1683', 'num_input_tokens_seen': 13686242, 'train_runtime': '6923', 'train_tokens_per_second': '1977'} +{'loss': '1.022', 'grad_norm': '1.912', 'learning_rate': '4.998e-05', 'epoch': '0.1684', 'num_input_tokens_seen': 13688289, 'train_runtime': '6924', 'train_tokens_per_second': '1977'} +{'loss': '0.6541', 'grad_norm': '1.43', 'learning_rate': '4.998e-05', 'epoch': '0.1684', 'num_input_tokens_seen': 13690336, 'train_runtime': '6925', 'train_tokens_per_second': '1977'} +{'loss': '0.3747', 'grad_norm': '1.054', 'learning_rate': '4.998e-05', 'epoch': '0.1684', 'num_input_tokens_seen': 13692383, 'train_runtime': '6926', 'train_tokens_per_second': '1977'} +{'loss': '0.384', 'grad_norm': '1.119', 'learning_rate': '4.998e-05', 'epoch': '0.1684', 'num_input_tokens_seen': 13694430, 'train_runtime': '6927', 'train_tokens_per_second': '1977'} +{'loss': '0.5179', 'grad_norm': '1.021', 'learning_rate': '4.998e-05', 'epoch': '0.1685', 'num_input_tokens_seen': 13696477, 'train_runtime': '6928', 'train_tokens_per_second': '1977'} +{'loss': '0.7169', 'grad_norm': '1.505', 'learning_rate': '4.998e-05', 'epoch': '0.1685', 'num_input_tokens_seen': 13698524, 'train_runtime': '6929', 'train_tokens_per_second': '1977'} +{'loss': '0.8747', 'grad_norm': '1.493', 'learning_rate': '4.998e-05', 'epoch': '0.1685', 'num_input_tokens_seen': 13700571, 'train_runtime': '6930', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '1.52', 'learning_rate': '4.998e-05', 'epoch': '0.1685', 'num_input_tokens_seen': 13702618, 'train_runtime': '6931', 'train_tokens_per_second': '1977'} +{'loss': '0.7236', 'grad_norm': '1.42', 'learning_rate': '4.998e-05', 'epoch': '0.1686', 'num_input_tokens_seen': 13704665, 'train_runtime': '6932', 'train_tokens_per_second': '1977'} +{'loss': '0.8027', 'grad_norm': '1.541', 'learning_rate': '4.998e-05', 'epoch': '0.1686', 'num_input_tokens_seen': 13706712, 'train_runtime': '6933', 'train_tokens_per_second': '1977'} +{'loss': '0.3779', 'grad_norm': '0.7099', 'learning_rate': '4.998e-05', 'epoch': '0.1686', 'num_input_tokens_seen': 13708759, 'train_runtime': '6934', 'train_tokens_per_second': '1977'} +{'loss': '0.3109', 'grad_norm': '1.1', 'learning_rate': '4.998e-05', 'epoch': '0.1686', 'num_input_tokens_seen': 13710806, 'train_runtime': '6935', 'train_tokens_per_second': '1977'} +{'loss': '0.2927', 'grad_norm': '1.071', 'learning_rate': '4.998e-05', 'epoch': '0.1687', 'num_input_tokens_seen': 13712853, 'train_runtime': '6936', 'train_tokens_per_second': '1977'} +{'loss': '0.9753', 'grad_norm': '1.469', 'learning_rate': '4.998e-05', 'epoch': '0.1687', 'num_input_tokens_seen': 13714900, 'train_runtime': '6937', 'train_tokens_per_second': '1977'} +{'loss': '1.462', 'grad_norm': '1.821', 'learning_rate': '4.998e-05', 'epoch': '0.1687', 'num_input_tokens_seen': 13716947, 'train_runtime': '6938', 'train_tokens_per_second': '1977'} +{'loss': '0.4783', 'grad_norm': '1.237', 'learning_rate': '4.998e-05', 'epoch': '0.1687', 'num_input_tokens_seen': 13718994, 'train_runtime': '6939', 'train_tokens_per_second': '1977'} +{'loss': '0.2938', 'grad_norm': '0.9637', 'learning_rate': '4.998e-05', 'epoch': '0.1688', 'num_input_tokens_seen': 13721041, 'train_runtime': '6940', 'train_tokens_per_second': '1977'} +{'loss': '0.4235', 'grad_norm': '0.8291', 'learning_rate': '4.998e-05', 'epoch': '0.1688', 'num_input_tokens_seen': 13723088, 'train_runtime': '6941', 'train_tokens_per_second': '1977'} +{'loss': '0.3386', 'grad_norm': '1.089', 'learning_rate': '4.998e-05', 'epoch': '0.1688', 'num_input_tokens_seen': 13725135, 'train_runtime': '6942', 'train_tokens_per_second': '1977'} +{'loss': '1.602', 'grad_norm': '2.056', 'learning_rate': '4.998e-05', 'epoch': '0.1688', 'num_input_tokens_seen': 13727182, 'train_runtime': '6943', 'train_tokens_per_second': '1977'} +{'loss': '0.3109', 'grad_norm': '1.157', 'learning_rate': '4.998e-05', 'epoch': '0.1689', 'num_input_tokens_seen': 13729229, 'train_runtime': '6944', 'train_tokens_per_second': '1977'} +{'loss': '0.7086', 'grad_norm': '1.356', 'learning_rate': '4.998e-05', 'epoch': '0.1689', 'num_input_tokens_seen': 13731276, 'train_runtime': '6945', 'train_tokens_per_second': '1977'} +{'loss': '0.2604', 'grad_norm': '1.009', 'learning_rate': '4.998e-05', 'epoch': '0.1689', 'num_input_tokens_seen': 13733323, 'train_runtime': '6946', 'train_tokens_per_second': '1977'} +{'loss': '0.346', 'grad_norm': '0.9688', 'learning_rate': '4.998e-05', 'epoch': '0.1689', 'num_input_tokens_seen': 13735370, 'train_runtime': '6947', 'train_tokens_per_second': '1977'} +{'loss': '0.3947', 'grad_norm': '1.016', 'learning_rate': '4.998e-05', 'epoch': '0.169', 'num_input_tokens_seen': 13737417, 'train_runtime': '6949', 'train_tokens_per_second': '1977'} +{'loss': '0.4397', 'grad_norm': '1.149', 'learning_rate': '4.998e-05', 'epoch': '0.169', 'num_input_tokens_seen': 13739464, 'train_runtime': '6950', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '1.839', 'learning_rate': '4.998e-05', 'epoch': '0.169', 'num_input_tokens_seen': 13741511, 'train_runtime': '6951', 'train_tokens_per_second': '1977'} +{'loss': '1.525', 'grad_norm': '2.108', 'learning_rate': '4.998e-05', 'epoch': '0.169', 'num_input_tokens_seen': 13743558, 'train_runtime': '6952', 'train_tokens_per_second': '1977'} +{'loss': '0.4985', 'grad_norm': '1.435', 'learning_rate': '4.998e-05', 'epoch': '0.1691', 'num_input_tokens_seen': 13745605, 'train_runtime': '6953', 'train_tokens_per_second': '1977'} +{'loss': '0.3526', 'grad_norm': '1.012', 'learning_rate': '4.998e-05', 'epoch': '0.1691', 'num_input_tokens_seen': 13747652, 'train_runtime': '6954', 'train_tokens_per_second': '1977'} +{'loss': '1.606', 'grad_norm': '2.455', 'learning_rate': '4.998e-05', 'epoch': '0.1691', 'num_input_tokens_seen': 13749699, 'train_runtime': '6955', 'train_tokens_per_second': '1977'} +{'loss': '0.3103', 'grad_norm': '1.076', 'learning_rate': '4.998e-05', 'epoch': '0.1691', 'num_input_tokens_seen': 13751746, 'train_runtime': '6956', 'train_tokens_per_second': '1977'} +{'loss': '0.5564', 'grad_norm': '1.322', 'learning_rate': '4.998e-05', 'epoch': '0.1692', 'num_input_tokens_seen': 13753793, 'train_runtime': '6957', 'train_tokens_per_second': '1977'} +{'loss': '0.4575', 'grad_norm': '1.003', 'learning_rate': '4.998e-05', 'epoch': '0.1692', 'num_input_tokens_seen': 13755840, 'train_runtime': '6958', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '1.874', 'learning_rate': '4.998e-05', 'epoch': '0.1692', 'num_input_tokens_seen': 13757887, 'train_runtime': '6959', 'train_tokens_per_second': '1977'} +{'loss': '1.869', 'grad_norm': '2.204', 'learning_rate': '4.998e-05', 'epoch': '0.1692', 'num_input_tokens_seen': 13759934, 'train_runtime': '6960', 'train_tokens_per_second': '1977'} +{'loss': '0.5507', 'grad_norm': '1.147', 'learning_rate': '4.998e-05', 'epoch': '0.1693', 'num_input_tokens_seen': 13761981, 'train_runtime': '6961', 'train_tokens_per_second': '1977'} +{'loss': '1.267', 'grad_norm': '2.053', 'learning_rate': '4.998e-05', 'epoch': '0.1693', 'num_input_tokens_seen': 13764028, 'train_runtime': '6962', 'train_tokens_per_second': '1977'} +{'loss': '1.125', 'grad_norm': '1.613', 'learning_rate': '4.998e-05', 'epoch': '0.1693', 'num_input_tokens_seen': 13766075, 'train_runtime': '6963', 'train_tokens_per_second': '1977'} +{'loss': '1.27', 'grad_norm': '1.862', 'learning_rate': '4.998e-05', 'epoch': '0.1693', 'num_input_tokens_seen': 13768122, 'train_runtime': '6964', 'train_tokens_per_second': '1977'} +{'loss': '0.4308', 'grad_norm': '1.021', 'learning_rate': '4.998e-05', 'epoch': '0.1694', 'num_input_tokens_seen': 13770169, 'train_runtime': '6965', 'train_tokens_per_second': '1977'} +{'loss': '0.3541', 'grad_norm': '0.8078', 'learning_rate': '4.998e-05', 'epoch': '0.1694', 'num_input_tokens_seen': 13772216, 'train_runtime': '6966', 'train_tokens_per_second': '1977'} +{'loss': '0.3806', 'grad_norm': '0.9589', 'learning_rate': '4.998e-05', 'epoch': '0.1694', 'num_input_tokens_seen': 13774263, 'train_runtime': '6967', 'train_tokens_per_second': '1977'} +{'loss': '0.5084', 'grad_norm': '1.394', 'learning_rate': '4.998e-05', 'epoch': '0.1694', 'num_input_tokens_seen': 13776310, 'train_runtime': '6968', 'train_tokens_per_second': '1977'} +{'loss': '0.426', 'grad_norm': '1.299', 'learning_rate': '4.998e-05', 'epoch': '0.1695', 'num_input_tokens_seen': 13778357, 'train_runtime': '6969', 'train_tokens_per_second': '1977'} +{'loss': '0.6489', 'grad_norm': '1.415', 'learning_rate': '4.998e-05', 'epoch': '0.1695', 'num_input_tokens_seen': 13780404, 'train_runtime': '6970', 'train_tokens_per_second': '1977'} +{'loss': '0.7715', 'grad_norm': '1.474', 'learning_rate': '4.998e-05', 'epoch': '0.1695', 'num_input_tokens_seen': 13782451, 'train_runtime': '6971', 'train_tokens_per_second': '1977'} +{'loss': '0.806', 'grad_norm': '1.782', 'learning_rate': '4.998e-05', 'epoch': '0.1695', 'num_input_tokens_seen': 13784498, 'train_runtime': '6972', 'train_tokens_per_second': '1977'} +{'loss': '0.5569', 'grad_norm': '1.127', 'learning_rate': '4.998e-05', 'epoch': '0.1696', 'num_input_tokens_seen': 13786545, 'train_runtime': '6973', 'train_tokens_per_second': '1977'} +{'loss': '0.5801', 'grad_norm': '1.288', 'learning_rate': '4.998e-05', 'epoch': '0.1696', 'num_input_tokens_seen': 13788592, 'train_runtime': '6974', 'train_tokens_per_second': '1977'} +{'loss': '0.5226', 'grad_norm': '1.065', 'learning_rate': '4.998e-05', 'epoch': '0.1696', 'num_input_tokens_seen': 13790639, 'train_runtime': '6975', 'train_tokens_per_second': '1977'} +{'loss': '1.405', 'grad_norm': '1.909', 'learning_rate': '4.998e-05', 'epoch': '0.1697', 'num_input_tokens_seen': 13792686, 'train_runtime': '6976', 'train_tokens_per_second': '1977'} +{'loss': '0.3896', 'grad_norm': '1.016', 'learning_rate': '4.998e-05', 'epoch': '0.1697', 'num_input_tokens_seen': 13794733, 'train_runtime': '6978', 'train_tokens_per_second': '1977'} +{'loss': '0.4138', 'grad_norm': '1.148', 'learning_rate': '4.998e-05', 'epoch': '0.1697', 'num_input_tokens_seen': 13796780, 'train_runtime': '6979', 'train_tokens_per_second': '1977'} +{'loss': '0.6086', 'grad_norm': '1.311', 'learning_rate': '4.998e-05', 'epoch': '0.1697', 'num_input_tokens_seen': 13798827, 'train_runtime': '6980', 'train_tokens_per_second': '1977'} +{'loss': '1.206', 'grad_norm': '1.835', 'learning_rate': '4.998e-05', 'epoch': '0.1698', 'num_input_tokens_seen': 13800874, 'train_runtime': '6981', 'train_tokens_per_second': '1977'} +{'loss': '0.7234', 'grad_norm': '1.424', 'learning_rate': '4.998e-05', 'epoch': '0.1698', 'num_input_tokens_seen': 13802921, 'train_runtime': '6982', 'train_tokens_per_second': '1977'} +{'loss': '0.3982', 'grad_norm': '0.9327', 'learning_rate': '4.997e-05', 'epoch': '0.1698', 'num_input_tokens_seen': 13804968, 'train_runtime': '6983', 'train_tokens_per_second': '1977'} +{'loss': '0.3543', 'grad_norm': '1.034', 'learning_rate': '4.997e-05', 'epoch': '0.1698', 'num_input_tokens_seen': 13807015, 'train_runtime': '6984', 'train_tokens_per_second': '1977'} +{'loss': '0.3698', 'grad_norm': '0.9619', 'learning_rate': '4.997e-05', 'epoch': '0.1699', 'num_input_tokens_seen': 13809062, 'train_runtime': '6985', 'train_tokens_per_second': '1977'} +{'loss': '2.102', 'grad_norm': '2.373', 'learning_rate': '4.997e-05', 'epoch': '0.1699', 'num_input_tokens_seen': 13811109, 'train_runtime': '6986', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '1.34', 'learning_rate': '4.997e-05', 'epoch': '0.1699', 'num_input_tokens_seen': 13813156, 'train_runtime': '6987', 'train_tokens_per_second': '1977'} +{'loss': '0.7553', 'grad_norm': '1.415', 'learning_rate': '4.997e-05', 'epoch': '0.1699', 'num_input_tokens_seen': 13815203, 'train_runtime': '6988', 'train_tokens_per_second': '1977'} +{'loss': '0.4167', 'grad_norm': '1.098', 'learning_rate': '4.997e-05', 'epoch': '0.17', 'num_input_tokens_seen': 13817250, 'train_runtime': '6989', 'train_tokens_per_second': '1977'} +{'loss': '0.3456', 'grad_norm': '1.071', 'learning_rate': '4.997e-05', 'epoch': '0.17', 'num_input_tokens_seen': 13819297, 'train_runtime': '6990', 'train_tokens_per_second': '1977'} +{'loss': '0.5079', 'grad_norm': '1.245', 'learning_rate': '4.997e-05', 'epoch': '0.17', 'num_input_tokens_seen': 13821344, 'train_runtime': '6991', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '1.697', 'learning_rate': '4.997e-05', 'epoch': '0.17', 'num_input_tokens_seen': 13823391, 'train_runtime': '6992', 'train_tokens_per_second': '1977'} +{'loss': '0.3131', 'grad_norm': '0.9058', 'learning_rate': '4.997e-05', 'epoch': '0.1701', 'num_input_tokens_seen': 13825438, 'train_runtime': '6993', 'train_tokens_per_second': '1977'} +{'loss': '0.8744', 'grad_norm': '1.311', 'learning_rate': '4.997e-05', 'epoch': '0.1701', 'num_input_tokens_seen': 13827485, 'train_runtime': '6994', 'train_tokens_per_second': '1977'} +{'loss': '0.2971', 'grad_norm': '1.122', 'learning_rate': '4.997e-05', 'epoch': '0.1701', 'num_input_tokens_seen': 13829532, 'train_runtime': '6995', 'train_tokens_per_second': '1977'} +{'loss': '0.7633', 'grad_norm': '1.375', 'learning_rate': '4.997e-05', 'epoch': '0.1701', 'num_input_tokens_seen': 13831579, 'train_runtime': '6996', 'train_tokens_per_second': '1977'} +{'loss': '0.3272', 'grad_norm': '0.9537', 'learning_rate': '4.997e-05', 'epoch': '0.1702', 'num_input_tokens_seen': 13833626, 'train_runtime': '6997', 'train_tokens_per_second': '1977'} +{'loss': '1.625', 'grad_norm': '2.644', 'learning_rate': '4.997e-05', 'epoch': '0.1702', 'num_input_tokens_seen': 13835673, 'train_runtime': '6998', 'train_tokens_per_second': '1977'} +{'loss': '0.609', 'grad_norm': '1.367', 'learning_rate': '4.997e-05', 'epoch': '0.1702', 'num_input_tokens_seen': 13837720, 'train_runtime': '6999', 'train_tokens_per_second': '1977'} +{'loss': '0.4341', 'grad_norm': '0.9677', 'learning_rate': '4.997e-05', 'epoch': '0.1702', 'num_input_tokens_seen': 13839767, 'train_runtime': '7000', 'train_tokens_per_second': '1977'} +{'loss': '0.305', 'grad_norm': '1.063', 'learning_rate': '4.997e-05', 'epoch': '0.1703', 'num_input_tokens_seen': 13841814, 'train_runtime': '7001', 'train_tokens_per_second': '1977'} +{'loss': '0.6809', 'grad_norm': '1.484', 'learning_rate': '4.997e-05', 'epoch': '0.1703', 'num_input_tokens_seen': 13843861, 'train_runtime': '7002', 'train_tokens_per_second': '1977'} +{'loss': '0.842', 'grad_norm': '1.332', 'learning_rate': '4.997e-05', 'epoch': '0.1703', 'num_input_tokens_seen': 13845908, 'train_runtime': '7003', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '1.907', 'learning_rate': '4.997e-05', 'epoch': '0.1703', 'num_input_tokens_seen': 13847955, 'train_runtime': '7004', 'train_tokens_per_second': '1977'} +{'loss': '0.8602', 'grad_norm': '1.847', 'learning_rate': '4.997e-05', 'epoch': '0.1704', 'num_input_tokens_seen': 13850002, 'train_runtime': '7005', 'train_tokens_per_second': '1977'} +{'loss': '1.666', 'grad_norm': '2.032', 'learning_rate': '4.997e-05', 'epoch': '0.1704', 'num_input_tokens_seen': 13852049, 'train_runtime': '7007', 'train_tokens_per_second': '1977'} +{'loss': '0.4638', 'grad_norm': '1.086', 'learning_rate': '4.997e-05', 'epoch': '0.1704', 'num_input_tokens_seen': 13854096, 'train_runtime': '7008', 'train_tokens_per_second': '1977'} +{'loss': '0.3507', 'grad_norm': '0.9622', 'learning_rate': '4.997e-05', 'epoch': '0.1704', 'num_input_tokens_seen': 13856143, 'train_runtime': '7009', 'train_tokens_per_second': '1977'} +{'loss': '0.9861', 'grad_norm': '1.914', 'learning_rate': '4.997e-05', 'epoch': '0.1705', 'num_input_tokens_seen': 13858190, 'train_runtime': '7010', 'train_tokens_per_second': '1977'} +{'loss': '1.516', 'grad_norm': '2.126', 'learning_rate': '4.997e-05', 'epoch': '0.1705', 'num_input_tokens_seen': 13860237, 'train_runtime': '7011', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.452', 'learning_rate': '4.997e-05', 'epoch': '0.1705', 'num_input_tokens_seen': 13862284, 'train_runtime': '7012', 'train_tokens_per_second': '1977'} +{'loss': '1.307', 'grad_norm': '1.749', 'learning_rate': '4.997e-05', 'epoch': '0.1705', 'num_input_tokens_seen': 13864331, 'train_runtime': '7013', 'train_tokens_per_second': '1977'} +{'loss': '0.4106', 'grad_norm': '1.039', 'learning_rate': '4.997e-05', 'epoch': '0.1706', 'num_input_tokens_seen': 13866378, 'train_runtime': '7014', 'train_tokens_per_second': '1977'} +{'loss': '0.3016', 'grad_norm': '1.023', 'learning_rate': '4.997e-05', 'epoch': '0.1706', 'num_input_tokens_seen': 13868425, 'train_runtime': '7015', 'train_tokens_per_second': '1977'} +{'loss': '0.3912', 'grad_norm': '1.052', 'learning_rate': '4.997e-05', 'epoch': '0.1706', 'num_input_tokens_seen': 13870472, 'train_runtime': '7016', 'train_tokens_per_second': '1977'} +{'loss': '1.039', 'grad_norm': '2.256', 'learning_rate': '4.997e-05', 'epoch': '0.1706', 'num_input_tokens_seen': 13872519, 'train_runtime': '7017', 'train_tokens_per_second': '1977'} +{'loss': '0.5553', 'grad_norm': '1.461', 'learning_rate': '4.997e-05', 'epoch': '0.1707', 'num_input_tokens_seen': 13874566, 'train_runtime': '7018', 'train_tokens_per_second': '1977'} +{'loss': '0.5927', 'grad_norm': '1.554', 'learning_rate': '4.997e-05', 'epoch': '0.1707', 'num_input_tokens_seen': 13876613, 'train_runtime': '7019', 'train_tokens_per_second': '1977'} +{'loss': '0.4793', 'grad_norm': '1.152', 'learning_rate': '4.997e-05', 'epoch': '0.1707', 'num_input_tokens_seen': 13878660, 'train_runtime': '7020', 'train_tokens_per_second': '1977'} +{'loss': '0.9369', 'grad_norm': '1.526', 'learning_rate': '4.997e-05', 'epoch': '0.1707', 'num_input_tokens_seen': 13880707, 'train_runtime': '7021', 'train_tokens_per_second': '1977'} +{'loss': '0.2415', 'grad_norm': '0.9503', 'learning_rate': '4.997e-05', 'epoch': '0.1708', 'num_input_tokens_seen': 13882754, 'train_runtime': '7022', 'train_tokens_per_second': '1977'} +{'loss': '0.739', 'grad_norm': '1.121', 'learning_rate': '4.997e-05', 'epoch': '0.1708', 'num_input_tokens_seen': 13884801, 'train_runtime': '7023', 'train_tokens_per_second': '1977'} +{'loss': '0.7736', 'grad_norm': '1.089', 'learning_rate': '4.997e-05', 'epoch': '0.1708', 'num_input_tokens_seen': 13886848, 'train_runtime': '7024', 'train_tokens_per_second': '1977'} +{'loss': '2.194', 'grad_norm': '2.873', 'learning_rate': '4.997e-05', 'epoch': '0.1708', 'num_input_tokens_seen': 13888895, 'train_runtime': '7025', 'train_tokens_per_second': '1977'} +{'loss': '2.276', 'grad_norm': '2.466', 'learning_rate': '4.997e-05', 'epoch': '0.1709', 'num_input_tokens_seen': 13890942, 'train_runtime': '7026', 'train_tokens_per_second': '1977'} +{'loss': '0.6811', 'grad_norm': '1.31', 'learning_rate': '4.997e-05', 'epoch': '0.1709', 'num_input_tokens_seen': 13892989, 'train_runtime': '7027', 'train_tokens_per_second': '1977'} +{'loss': '0.6525', 'grad_norm': '1.48', 'learning_rate': '4.997e-05', 'epoch': '0.1709', 'num_input_tokens_seen': 13895036, 'train_runtime': '7028', 'train_tokens_per_second': '1977'} +{'loss': '0.6746', 'grad_norm': '1.395', 'learning_rate': '4.997e-05', 'epoch': '0.1709', 'num_input_tokens_seen': 13897083, 'train_runtime': '7029', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '1.923', 'learning_rate': '4.997e-05', 'epoch': '0.171', 'num_input_tokens_seen': 13899130, 'train_runtime': '7030', 'train_tokens_per_second': '1977'} +{'loss': '1.19', 'grad_norm': '1.907', 'learning_rate': '4.997e-05', 'epoch': '0.171', 'num_input_tokens_seen': 13901177, 'train_runtime': '7031', 'train_tokens_per_second': '1977'} +{'loss': '1.121', 'grad_norm': '1.631', 'learning_rate': '4.997e-05', 'epoch': '0.171', 'num_input_tokens_seen': 13903224, 'train_runtime': '7032', 'train_tokens_per_second': '1977'} +{'loss': '0.9566', 'grad_norm': '1.398', 'learning_rate': '4.997e-05', 'epoch': '0.171', 'num_input_tokens_seen': 13905271, 'train_runtime': '7033', 'train_tokens_per_second': '1977'} +{'loss': '0.8924', 'grad_norm': '1.332', 'learning_rate': '4.997e-05', 'epoch': '0.1711', 'num_input_tokens_seen': 13907318, 'train_runtime': '7034', 'train_tokens_per_second': '1977'} +{'loss': '0.4794', 'grad_norm': '0.9235', 'learning_rate': '4.997e-05', 'epoch': '0.1711', 'num_input_tokens_seen': 13909365, 'train_runtime': '7035', 'train_tokens_per_second': '1977'} +{'loss': '0.7949', 'grad_norm': '1.467', 'learning_rate': '4.997e-05', 'epoch': '0.1711', 'num_input_tokens_seen': 13911412, 'train_runtime': '7037', 'train_tokens_per_second': '1977'} +{'loss': '0.6971', 'grad_norm': '1.074', 'learning_rate': '4.997e-05', 'epoch': '0.1711', 'num_input_tokens_seen': 13913459, 'train_runtime': '7038', 'train_tokens_per_second': '1977'} +{'loss': '0.8107', 'grad_norm': '1.583', 'learning_rate': '4.997e-05', 'epoch': '0.1712', 'num_input_tokens_seen': 13915506, 'train_runtime': '7039', 'train_tokens_per_second': '1977'} +{'loss': '1.048', 'grad_norm': '1.385', 'learning_rate': '4.997e-05', 'epoch': '0.1712', 'num_input_tokens_seen': 13917553, 'train_runtime': '7040', 'train_tokens_per_second': '1977'} +{'loss': '0.9549', 'grad_norm': '1.807', 'learning_rate': '4.997e-05', 'epoch': '0.1712', 'num_input_tokens_seen': 13919600, 'train_runtime': '7041', 'train_tokens_per_second': '1977'} +{'loss': '0.4547', 'grad_norm': '0.8557', 'learning_rate': '4.997e-05', 'epoch': '0.1712', 'num_input_tokens_seen': 13921647, 'train_runtime': '7042', 'train_tokens_per_second': '1977'} +{'loss': '0.696', 'grad_norm': '1.248', 'learning_rate': '4.997e-05', 'epoch': '0.1713', 'num_input_tokens_seen': 13923694, 'train_runtime': '7043', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '1.988', 'learning_rate': '4.997e-05', 'epoch': '0.1713', 'num_input_tokens_seen': 13925741, 'train_runtime': '7044', 'train_tokens_per_second': '1977'} +{'loss': '0.7051', 'grad_norm': '1.181', 'learning_rate': '4.997e-05', 'epoch': '0.1713', 'num_input_tokens_seen': 13927788, 'train_runtime': '7045', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.478', 'learning_rate': '4.997e-05', 'epoch': '0.1713', 'num_input_tokens_seen': 13929835, 'train_runtime': '7046', 'train_tokens_per_second': '1977'} +{'loss': '0.5817', 'grad_norm': '1.179', 'learning_rate': '4.997e-05', 'epoch': '0.1714', 'num_input_tokens_seen': 13931882, 'train_runtime': '7047', 'train_tokens_per_second': '1977'} +{'loss': '0.7924', 'grad_norm': '1.575', 'learning_rate': '4.997e-05', 'epoch': '0.1714', 'num_input_tokens_seen': 13933929, 'train_runtime': '7048', 'train_tokens_per_second': '1977'} +{'loss': '1.002', 'grad_norm': '1.32', 'learning_rate': '4.997e-05', 'epoch': '0.1714', 'num_input_tokens_seen': 13935976, 'train_runtime': '7049', 'train_tokens_per_second': '1977'} +{'loss': '0.261', 'grad_norm': '0.908', 'learning_rate': '4.997e-05', 'epoch': '0.1714', 'num_input_tokens_seen': 13938023, 'train_runtime': '7050', 'train_tokens_per_second': '1977'} +{'loss': '0.8109', 'grad_norm': '1.421', 'learning_rate': '4.997e-05', 'epoch': '0.1715', 'num_input_tokens_seen': 13940070, 'train_runtime': '7051', 'train_tokens_per_second': '1977'} +{'loss': '0.3899', 'grad_norm': '1.031', 'learning_rate': '4.997e-05', 'epoch': '0.1715', 'num_input_tokens_seen': 13942117, 'train_runtime': '7052', 'train_tokens_per_second': '1977'} +{'loss': '0.7726', 'grad_norm': '1.411', 'learning_rate': '4.997e-05', 'epoch': '0.1715', 'num_input_tokens_seen': 13944164, 'train_runtime': '7053', 'train_tokens_per_second': '1977'} +{'loss': '0.6036', 'grad_norm': '1.176', 'learning_rate': '4.997e-05', 'epoch': '0.1715', 'num_input_tokens_seen': 13946211, 'train_runtime': '7054', 'train_tokens_per_second': '1977'} +{'loss': '1.383', 'grad_norm': '2.855', 'learning_rate': '4.997e-05', 'epoch': '0.1716', 'num_input_tokens_seen': 13948258, 'train_runtime': '7055', 'train_tokens_per_second': '1977'} +{'loss': '1.375', 'grad_norm': '2.197', 'learning_rate': '4.997e-05', 'epoch': '0.1716', 'num_input_tokens_seen': 13950305, 'train_runtime': '7056', 'train_tokens_per_second': '1977'} +{'loss': '2.246', 'grad_norm': '2.767', 'learning_rate': '4.997e-05', 'epoch': '0.1716', 'num_input_tokens_seen': 13952352, 'train_runtime': '7057', 'train_tokens_per_second': '1977'} +{'loss': '0.8123', 'grad_norm': '1.345', 'learning_rate': '4.997e-05', 'epoch': '0.1716', 'num_input_tokens_seen': 13954399, 'train_runtime': '7058', 'train_tokens_per_second': '1977'} +{'loss': '1.739', 'grad_norm': '2.276', 'learning_rate': '4.997e-05', 'epoch': '0.1717', 'num_input_tokens_seen': 13956446, 'train_runtime': '7059', 'train_tokens_per_second': '1977'} +{'loss': '0.4866', 'grad_norm': '1.323', 'learning_rate': '4.997e-05', 'epoch': '0.1717', 'num_input_tokens_seen': 13958493, 'train_runtime': '7060', 'train_tokens_per_second': '1977'} +{'loss': '0.3321', 'grad_norm': '0.9638', 'learning_rate': '4.997e-05', 'epoch': '0.1717', 'num_input_tokens_seen': 13960540, 'train_runtime': '7061', 'train_tokens_per_second': '1977'} +{'loss': '0.7631', 'grad_norm': '1.071', 'learning_rate': '4.997e-05', 'epoch': '0.1717', 'num_input_tokens_seen': 13962587, 'train_runtime': '7062', 'train_tokens_per_second': '1977'} +{'loss': '2.2', 'grad_norm': '2.043', 'learning_rate': '4.997e-05', 'epoch': '0.1718', 'num_input_tokens_seen': 13964634, 'train_runtime': '7063', 'train_tokens_per_second': '1977'} +{'loss': '0.9286', 'grad_norm': '1.342', 'learning_rate': '4.997e-05', 'epoch': '0.1718', 'num_input_tokens_seen': 13966681, 'train_runtime': '7064', 'train_tokens_per_second': '1977'} +{'loss': '0.3841', 'grad_norm': '0.9631', 'learning_rate': '4.997e-05', 'epoch': '0.1718', 'num_input_tokens_seen': 13968728, 'train_runtime': '7066', 'train_tokens_per_second': '1977'} +{'loss': '2.279', 'grad_norm': '2.376', 'learning_rate': '4.997e-05', 'epoch': '0.1718', 'num_input_tokens_seen': 13970775, 'train_runtime': '7067', 'train_tokens_per_second': '1977'} +{'loss': '1.336', 'grad_norm': '1.84', 'learning_rate': '4.997e-05', 'epoch': '0.1719', 'num_input_tokens_seen': 13972822, 'train_runtime': '7068', 'train_tokens_per_second': '1977'} +{'loss': '0.5911', 'grad_norm': '1.263', 'learning_rate': '4.997e-05', 'epoch': '0.1719', 'num_input_tokens_seen': 13974869, 'train_runtime': '7069', 'train_tokens_per_second': '1977'} +{'loss': '0.699', 'grad_norm': '1.117', 'learning_rate': '4.997e-05', 'epoch': '0.1719', 'num_input_tokens_seen': 13976916, 'train_runtime': '7070', 'train_tokens_per_second': '1977'} +{'loss': '0.2707', 'grad_norm': '1.045', 'learning_rate': '4.997e-05', 'epoch': '0.1719', 'num_input_tokens_seen': 13978963, 'train_runtime': '7071', 'train_tokens_per_second': '1977'} +{'loss': '0.4209', 'grad_norm': '1.091', 'learning_rate': '4.997e-05', 'epoch': '0.172', 'num_input_tokens_seen': 13981010, 'train_runtime': '7072', 'train_tokens_per_second': '1977'} +{'loss': '0.3352', 'grad_norm': '0.9228', 'learning_rate': '4.997e-05', 'epoch': '0.172', 'num_input_tokens_seen': 13983057, 'train_runtime': '7073', 'train_tokens_per_second': '1977'} +{'loss': '0.6347', 'grad_norm': '1.252', 'learning_rate': '4.997e-05', 'epoch': '0.172', 'num_input_tokens_seen': 13985104, 'train_runtime': '7074', 'train_tokens_per_second': '1977'} +{'loss': '0.5191', 'grad_norm': '1.232', 'learning_rate': '4.997e-05', 'epoch': '0.172', 'num_input_tokens_seen': 13987151, 'train_runtime': '7075', 'train_tokens_per_second': '1977'} +{'loss': '1.214', 'grad_norm': '1.691', 'learning_rate': '4.997e-05', 'epoch': '0.1721', 'num_input_tokens_seen': 13989198, 'train_runtime': '7076', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '1.674', 'learning_rate': '4.997e-05', 'epoch': '0.1721', 'num_input_tokens_seen': 13991245, 'train_runtime': '7077', 'train_tokens_per_second': '1977'} +{'loss': '0.7208', 'grad_norm': '1.01', 'learning_rate': '4.997e-05', 'epoch': '0.1721', 'num_input_tokens_seen': 13993292, 'train_runtime': '7078', 'train_tokens_per_second': '1977'} +{'loss': '0.2733', 'grad_norm': '0.9164', 'learning_rate': '4.997e-05', 'epoch': '0.1721', 'num_input_tokens_seen': 13995339, 'train_runtime': '7079', 'train_tokens_per_second': '1977'} +{'loss': '0.9159', 'grad_norm': '1.401', 'learning_rate': '4.997e-05', 'epoch': '0.1722', 'num_input_tokens_seen': 13997386, 'train_runtime': '7080', 'train_tokens_per_second': '1977'} +{'loss': '1.427', 'grad_norm': '2.012', 'learning_rate': '4.997e-05', 'epoch': '0.1722', 'num_input_tokens_seen': 13999433, 'train_runtime': '7081', 'train_tokens_per_second': '1977'} +{'loss': '1.336', 'grad_norm': '1.962', 'learning_rate': '4.997e-05', 'epoch': '0.1722', 'num_input_tokens_seen': 14001480, 'train_runtime': '7082', 'train_tokens_per_second': '1977'} +{'loss': '1.282', 'grad_norm': '1.867', 'learning_rate': '4.997e-05', 'epoch': '0.1722', 'num_input_tokens_seen': 14003527, 'train_runtime': '7083', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '1.471', 'learning_rate': '4.997e-05', 'epoch': '0.1723', 'num_input_tokens_seen': 14005574, 'train_runtime': '7084', 'train_tokens_per_second': '1977'} +{'loss': '0.5458', 'grad_norm': '1.338', 'learning_rate': '4.997e-05', 'epoch': '0.1723', 'num_input_tokens_seen': 14007621, 'train_runtime': '7085', 'train_tokens_per_second': '1977'} +{'loss': '0.8581', 'grad_norm': '1.384', 'learning_rate': '4.997e-05', 'epoch': '0.1723', 'num_input_tokens_seen': 14009668, 'train_runtime': '7086', 'train_tokens_per_second': '1977'} +{'loss': '1.981', 'grad_norm': '2.419', 'learning_rate': '4.997e-05', 'epoch': '0.1723', 'num_input_tokens_seen': 14011715, 'train_runtime': '7087', 'train_tokens_per_second': '1977'} +{'loss': '0.457', 'grad_norm': '1.04', 'learning_rate': '4.997e-05', 'epoch': '0.1724', 'num_input_tokens_seen': 14013762, 'train_runtime': '7088', 'train_tokens_per_second': '1977'} +{'loss': '0.449', 'grad_norm': '0.9266', 'learning_rate': '4.997e-05', 'epoch': '0.1724', 'num_input_tokens_seen': 14015809, 'train_runtime': '7089', 'train_tokens_per_second': '1977'} +{'loss': '0.6789', 'grad_norm': '1.236', 'learning_rate': '4.997e-05', 'epoch': '0.1724', 'num_input_tokens_seen': 14017856, 'train_runtime': '7090', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.717', 'learning_rate': '4.997e-05', 'epoch': '0.1724', 'num_input_tokens_seen': 14019903, 'train_runtime': '7091', 'train_tokens_per_second': '1977'} +{'loss': '0.7015', 'grad_norm': '1.125', 'learning_rate': '4.997e-05', 'epoch': '0.1725', 'num_input_tokens_seen': 14021950, 'train_runtime': '7092', 'train_tokens_per_second': '1977'} +{'loss': '2.399', 'grad_norm': '3.86', 'learning_rate': '4.997e-05', 'epoch': '0.1725', 'num_input_tokens_seen': 14023997, 'train_runtime': '7093', 'train_tokens_per_second': '1977'} +{'loss': '0.445', 'grad_norm': '1.192', 'learning_rate': '4.997e-05', 'epoch': '0.1725', 'num_input_tokens_seen': 14026044, 'train_runtime': '7095', 'train_tokens_per_second': '1977'} +{'loss': '0.6523', 'grad_norm': '0.9886', 'learning_rate': '4.997e-05', 'epoch': '0.1725', 'num_input_tokens_seen': 14028091, 'train_runtime': '7096', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.823', 'learning_rate': '4.997e-05', 'epoch': '0.1726', 'num_input_tokens_seen': 14030138, 'train_runtime': '7097', 'train_tokens_per_second': '1977'} +{'loss': '1.743', 'grad_norm': '2.135', 'learning_rate': '4.997e-05', 'epoch': '0.1726', 'num_input_tokens_seen': 14032185, 'train_runtime': '7098', 'train_tokens_per_second': '1977'} +{'loss': '0.6514', 'grad_norm': '0.955', 'learning_rate': '4.997e-05', 'epoch': '0.1726', 'num_input_tokens_seen': 14034232, 'train_runtime': '7099', 'train_tokens_per_second': '1977'} +{'loss': '2.221', 'grad_norm': '2.317', 'learning_rate': '4.997e-05', 'epoch': '0.1726', 'num_input_tokens_seen': 14036279, 'train_runtime': '7100', 'train_tokens_per_second': '1977'} +{'loss': '2.124', 'grad_norm': '2.43', 'learning_rate': '4.997e-05', 'epoch': '0.1727', 'num_input_tokens_seen': 14038326, 'train_runtime': '7101', 'train_tokens_per_second': '1977'} +{'loss': '0.2168', 'grad_norm': '0.9984', 'learning_rate': '4.997e-05', 'epoch': '0.1727', 'num_input_tokens_seen': 14040373, 'train_runtime': '7102', 'train_tokens_per_second': '1977'} +{'loss': '1.281', 'grad_norm': '1.963', 'learning_rate': '4.997e-05', 'epoch': '0.1727', 'num_input_tokens_seen': 14042420, 'train_runtime': '7103', 'train_tokens_per_second': '1977'} +{'loss': '1.351', 'grad_norm': '2.099', 'learning_rate': '4.997e-05', 'epoch': '0.1727', 'num_input_tokens_seen': 14044467, 'train_runtime': '7104', 'train_tokens_per_second': '1977'} +{'loss': '0.2465', 'grad_norm': '0.9138', 'learning_rate': '4.997e-05', 'epoch': '0.1728', 'num_input_tokens_seen': 14046514, 'train_runtime': '7105', 'train_tokens_per_second': '1977'} +{'loss': '0.6159', 'grad_norm': '1.004', 'learning_rate': '4.997e-05', 'epoch': '0.1728', 'num_input_tokens_seen': 14048561, 'train_runtime': '7106', 'train_tokens_per_second': '1977'} +{'loss': '0.7946', 'grad_norm': '1.434', 'learning_rate': '4.997e-05', 'epoch': '0.1728', 'num_input_tokens_seen': 14050608, 'train_runtime': '7107', 'train_tokens_per_second': '1977'} +{'loss': '1.161', 'grad_norm': '2.252', 'learning_rate': '4.997e-05', 'epoch': '0.1728', 'num_input_tokens_seen': 14052655, 'train_runtime': '7108', 'train_tokens_per_second': '1977'} +{'loss': '0.4944', 'grad_norm': '1.025', 'learning_rate': '4.997e-05', 'epoch': '0.1729', 'num_input_tokens_seen': 14054702, 'train_runtime': '7109', 'train_tokens_per_second': '1977'} +{'loss': '0.2822', 'grad_norm': '0.9245', 'learning_rate': '4.997e-05', 'epoch': '0.1729', 'num_input_tokens_seen': 14056749, 'train_runtime': '7110', 'train_tokens_per_second': '1977'} +{'loss': '0.5469', 'grad_norm': '1.14', 'learning_rate': '4.997e-05', 'epoch': '0.1729', 'num_input_tokens_seen': 14058796, 'train_runtime': '7111', 'train_tokens_per_second': '1977'} +{'loss': '0.5129', 'grad_norm': '1.092', 'learning_rate': '4.997e-05', 'epoch': '0.1729', 'num_input_tokens_seen': 14060843, 'train_runtime': '7112', 'train_tokens_per_second': '1977'} +{'loss': '0.7047', 'grad_norm': '1.346', 'learning_rate': '4.997e-05', 'epoch': '0.173', 'num_input_tokens_seen': 14062890, 'train_runtime': '7113', 'train_tokens_per_second': '1977'} +{'loss': '0.3843', 'grad_norm': '0.9864', 'learning_rate': '4.997e-05', 'epoch': '0.173', 'num_input_tokens_seen': 14064937, 'train_runtime': '7114', 'train_tokens_per_second': '1977'} +{'loss': '0.9056', 'grad_norm': '1.528', 'learning_rate': '4.997e-05', 'epoch': '0.173', 'num_input_tokens_seen': 14066984, 'train_runtime': '7115', 'train_tokens_per_second': '1977'} +{'loss': '1.46', 'grad_norm': '1.722', 'learning_rate': '4.997e-05', 'epoch': '0.173', 'num_input_tokens_seen': 14069031, 'train_runtime': '7116', 'train_tokens_per_second': '1977'} +{'loss': '1.258', 'grad_norm': '1.809', 'learning_rate': '4.997e-05', 'epoch': '0.1731', 'num_input_tokens_seen': 14071078, 'train_runtime': '7117', 'train_tokens_per_second': '1977'} +{'loss': '0.3821', 'grad_norm': '0.743', 'learning_rate': '4.997e-05', 'epoch': '0.1731', 'num_input_tokens_seen': 14073125, 'train_runtime': '7118', 'train_tokens_per_second': '1977'} +{'loss': '1.452', 'grad_norm': '2.074', 'learning_rate': '4.997e-05', 'epoch': '0.1731', 'num_input_tokens_seen': 14075172, 'train_runtime': '7119', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '1.997', 'learning_rate': '4.997e-05', 'epoch': '0.1732', 'num_input_tokens_seen': 14077219, 'train_runtime': '7120', 'train_tokens_per_second': '1977'} +{'loss': '0.3375', 'grad_norm': '0.9637', 'learning_rate': '4.997e-05', 'epoch': '0.1732', 'num_input_tokens_seen': 14079266, 'train_runtime': '7121', 'train_tokens_per_second': '1977'} +{'loss': '0.4915', 'grad_norm': '0.9254', 'learning_rate': '4.997e-05', 'epoch': '0.1732', 'num_input_tokens_seen': 14081313, 'train_runtime': '7122', 'train_tokens_per_second': '1977'} +{'loss': '0.4719', 'grad_norm': '1.003', 'learning_rate': '4.997e-05', 'epoch': '0.1732', 'num_input_tokens_seen': 14083360, 'train_runtime': '7124', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.558', 'learning_rate': '4.997e-05', 'epoch': '0.1733', 'num_input_tokens_seen': 14085407, 'train_runtime': '7125', 'train_tokens_per_second': '1977'} +{'loss': '0.7443', 'grad_norm': '1.574', 'learning_rate': '4.997e-05', 'epoch': '0.1733', 'num_input_tokens_seen': 14087454, 'train_runtime': '7126', 'train_tokens_per_second': '1977'} +{'loss': '0.4214', 'grad_norm': '0.9313', 'learning_rate': '4.997e-05', 'epoch': '0.1733', 'num_input_tokens_seen': 14089501, 'train_runtime': '7127', 'train_tokens_per_second': '1977'} +{'loss': '0.3061', 'grad_norm': '1.033', 'learning_rate': '4.997e-05', 'epoch': '0.1733', 'num_input_tokens_seen': 14091548, 'train_runtime': '7128', 'train_tokens_per_second': '1977'} +{'loss': '1.775', 'grad_norm': '2.368', 'learning_rate': '4.997e-05', 'epoch': '0.1734', 'num_input_tokens_seen': 14093595, 'train_runtime': '7129', 'train_tokens_per_second': '1977'} +{'loss': '0.9903', 'grad_norm': '1.577', 'learning_rate': '4.997e-05', 'epoch': '0.1734', 'num_input_tokens_seen': 14095642, 'train_runtime': '7130', 'train_tokens_per_second': '1977'} +{'loss': '0.2342', 'grad_norm': '1.088', 'learning_rate': '4.997e-05', 'epoch': '0.1734', 'num_input_tokens_seen': 14097689, 'train_runtime': '7131', 'train_tokens_per_second': '1977'} +{'loss': '0.6683', 'grad_norm': '1.271', 'learning_rate': '4.997e-05', 'epoch': '0.1734', 'num_input_tokens_seen': 14099736, 'train_runtime': '7132', 'train_tokens_per_second': '1977'} +{'loss': '0.5321', 'grad_norm': '1.219', 'learning_rate': '4.997e-05', 'epoch': '0.1735', 'num_input_tokens_seen': 14101783, 'train_runtime': '7133', 'train_tokens_per_second': '1977'} +{'loss': '0.6077', 'grad_norm': '1.367', 'learning_rate': '4.997e-05', 'epoch': '0.1735', 'num_input_tokens_seen': 14103830, 'train_runtime': '7134', 'train_tokens_per_second': '1977'} +{'loss': '1.386', 'grad_norm': '1.905', 'learning_rate': '4.997e-05', 'epoch': '0.1735', 'num_input_tokens_seen': 14105877, 'train_runtime': '7135', 'train_tokens_per_second': '1977'} +{'loss': '1.643', 'grad_norm': '2.392', 'learning_rate': '4.997e-05', 'epoch': '0.1735', 'num_input_tokens_seen': 14107924, 'train_runtime': '7136', 'train_tokens_per_second': '1977'} +{'loss': '0.9115', 'grad_norm': '1.465', 'learning_rate': '4.997e-05', 'epoch': '0.1736', 'num_input_tokens_seen': 14109971, 'train_runtime': '7137', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '1.622', 'learning_rate': '4.997e-05', 'epoch': '0.1736', 'num_input_tokens_seen': 14112018, 'train_runtime': '7138', 'train_tokens_per_second': '1977'} +{'loss': '0.9407', 'grad_norm': '1.243', 'learning_rate': '4.997e-05', 'epoch': '0.1736', 'num_input_tokens_seen': 14114065, 'train_runtime': '7139', 'train_tokens_per_second': '1977'} +{'loss': '0.5071', 'grad_norm': '1.187', 'learning_rate': '4.997e-05', 'epoch': '0.1736', 'num_input_tokens_seen': 14116112, 'train_runtime': '7140', 'train_tokens_per_second': '1977'} +{'loss': '0.3735', 'grad_norm': '0.9628', 'learning_rate': '4.997e-05', 'epoch': '0.1737', 'num_input_tokens_seen': 14118159, 'train_runtime': '7141', 'train_tokens_per_second': '1977'} +{'loss': '0.8691', 'grad_norm': '1.836', 'learning_rate': '4.997e-05', 'epoch': '0.1737', 'num_input_tokens_seen': 14120206, 'train_runtime': '7142', 'train_tokens_per_second': '1977'} +{'loss': '0.4619', 'grad_norm': '1.024', 'learning_rate': '4.997e-05', 'epoch': '0.1737', 'num_input_tokens_seen': 14122253, 'train_runtime': '7143', 'train_tokens_per_second': '1977'} +{'loss': '0.4247', 'grad_norm': '1.365', 'learning_rate': '4.997e-05', 'epoch': '0.1737', 'num_input_tokens_seen': 14124300, 'train_runtime': '7144', 'train_tokens_per_second': '1977'} +{'loss': '0.3847', 'grad_norm': '1.042', 'learning_rate': '4.997e-05', 'epoch': '0.1738', 'num_input_tokens_seen': 14126347, 'train_runtime': '7145', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '1.748', 'learning_rate': '4.997e-05', 'epoch': '0.1738', 'num_input_tokens_seen': 14128394, 'train_runtime': '7146', 'train_tokens_per_second': '1977'} +{'loss': '1.34', 'grad_norm': '2.102', 'learning_rate': '4.997e-05', 'epoch': '0.1738', 'num_input_tokens_seen': 14130441, 'train_runtime': '7147', 'train_tokens_per_second': '1977'} +{'loss': '0.8809', 'grad_norm': '1.519', 'learning_rate': '4.997e-05', 'epoch': '0.1738', 'num_input_tokens_seen': 14132488, 'train_runtime': '7148', 'train_tokens_per_second': '1977'} +{'loss': '0.7915', 'grad_norm': '1.494', 'learning_rate': '4.997e-05', 'epoch': '0.1739', 'num_input_tokens_seen': 14134535, 'train_runtime': '7149', 'train_tokens_per_second': '1977'} +{'loss': '0.3818', 'grad_norm': '1.163', 'learning_rate': '4.997e-05', 'epoch': '0.1739', 'num_input_tokens_seen': 14136582, 'train_runtime': '7150', 'train_tokens_per_second': '1977'} +{'loss': '0.9247', 'grad_norm': '1.44', 'learning_rate': '4.997e-05', 'epoch': '0.1739', 'num_input_tokens_seen': 14138629, 'train_runtime': '7152', 'train_tokens_per_second': '1977'} +{'loss': '0.4223', 'grad_norm': '0.8863', 'learning_rate': '4.997e-05', 'epoch': '0.1739', 'num_input_tokens_seen': 14140676, 'train_runtime': '7153', 'train_tokens_per_second': '1977'} +{'loss': '0.7564', 'grad_norm': '0.9713', 'learning_rate': '4.997e-05', 'epoch': '0.174', 'num_input_tokens_seen': 14142723, 'train_runtime': '7154', 'train_tokens_per_second': '1977'} +{'loss': '1.095', 'grad_norm': '1.624', 'learning_rate': '4.997e-05', 'epoch': '0.174', 'num_input_tokens_seen': 14144770, 'train_runtime': '7155', 'train_tokens_per_second': '1977'} +{'loss': '1.037', 'grad_norm': '1.836', 'learning_rate': '4.997e-05', 'epoch': '0.174', 'num_input_tokens_seen': 14146817, 'train_runtime': '7156', 'train_tokens_per_second': '1977'} +{'loss': '1.394', 'grad_norm': '1.917', 'learning_rate': '4.997e-05', 'epoch': '0.174', 'num_input_tokens_seen': 14148864, 'train_runtime': '7157', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '1.574', 'learning_rate': '4.997e-05', 'epoch': '0.1741', 'num_input_tokens_seen': 14150911, 'train_runtime': '7158', 'train_tokens_per_second': '1977'} +{'loss': '0.7574', 'grad_norm': '1.218', 'learning_rate': '4.997e-05', 'epoch': '0.1741', 'num_input_tokens_seen': 14152958, 'train_runtime': '7159', 'train_tokens_per_second': '1977'} +{'loss': '0.3427', 'grad_norm': '1.011', 'learning_rate': '4.997e-05', 'epoch': '0.1741', 'num_input_tokens_seen': 14155005, 'train_runtime': '7160', 'train_tokens_per_second': '1977'} +{'loss': '0.751', 'grad_norm': '1.682', 'learning_rate': '4.997e-05', 'epoch': '0.1741', 'num_input_tokens_seen': 14157052, 'train_runtime': '7161', 'train_tokens_per_second': '1977'} +{'loss': '0.9474', 'grad_norm': '1.891', 'learning_rate': '4.997e-05', 'epoch': '0.1742', 'num_input_tokens_seen': 14159099, 'train_runtime': '7162', 'train_tokens_per_second': '1977'} +{'loss': '1.043', 'grad_norm': '1.523', 'learning_rate': '4.997e-05', 'epoch': '0.1742', 'num_input_tokens_seen': 14161146, 'train_runtime': '7163', 'train_tokens_per_second': '1977'} +{'loss': '2.074', 'grad_norm': '2.234', 'learning_rate': '4.997e-05', 'epoch': '0.1742', 'num_input_tokens_seen': 14163193, 'train_runtime': '7164', 'train_tokens_per_second': '1977'} +{'loss': '1.141', 'grad_norm': '1.931', 'learning_rate': '4.997e-05', 'epoch': '0.1742', 'num_input_tokens_seen': 14165240, 'train_runtime': '7165', 'train_tokens_per_second': '1977'} +{'loss': '0.8579', 'grad_norm': '1.646', 'learning_rate': '4.997e-05', 'epoch': '0.1743', 'num_input_tokens_seen': 14167287, 'train_runtime': '7166', 'train_tokens_per_second': '1977'} +{'loss': '1.178', 'grad_norm': '1.605', 'learning_rate': '4.997e-05', 'epoch': '0.1743', 'num_input_tokens_seen': 14169334, 'train_runtime': '7167', 'train_tokens_per_second': '1977'} +{'loss': '0.5982', 'grad_norm': '1.195', 'learning_rate': '4.997e-05', 'epoch': '0.1743', 'num_input_tokens_seen': 14171381, 'train_runtime': '7168', 'train_tokens_per_second': '1977'} +{'loss': '0.9776', 'grad_norm': '1.912', 'learning_rate': '4.997e-05', 'epoch': '0.1743', 'num_input_tokens_seen': 14173428, 'train_runtime': '7169', 'train_tokens_per_second': '1977'} +{'loss': '0.3247', 'grad_norm': '1.092', 'learning_rate': '4.997e-05', 'epoch': '0.1744', 'num_input_tokens_seen': 14175475, 'train_runtime': '7170', 'train_tokens_per_second': '1977'} +{'loss': '0.7461', 'grad_norm': '0.9933', 'learning_rate': '4.997e-05', 'epoch': '0.1744', 'num_input_tokens_seen': 14177522, 'train_runtime': '7171', 'train_tokens_per_second': '1977'} +{'loss': '0.6759', 'grad_norm': '1.294', 'learning_rate': '4.997e-05', 'epoch': '0.1744', 'num_input_tokens_seen': 14179569, 'train_runtime': '7172', 'train_tokens_per_second': '1977'} +{'loss': '0.7793', 'grad_norm': '1.425', 'learning_rate': '4.997e-05', 'epoch': '0.1744', 'num_input_tokens_seen': 14181616, 'train_runtime': '7173', 'train_tokens_per_second': '1977'} +{'loss': '1.122', 'grad_norm': '2.085', 'learning_rate': '4.997e-05', 'epoch': '0.1745', 'num_input_tokens_seen': 14183663, 'train_runtime': '7174', 'train_tokens_per_second': '1977'} +{'loss': '0.8915', 'grad_norm': '1.517', 'learning_rate': '4.997e-05', 'epoch': '0.1745', 'num_input_tokens_seen': 14185710, 'train_runtime': '7175', 'train_tokens_per_second': '1977'} +{'loss': '0.7013', 'grad_norm': '1.38', 'learning_rate': '4.997e-05', 'epoch': '0.1745', 'num_input_tokens_seen': 14187757, 'train_runtime': '7176', 'train_tokens_per_second': '1977'} +{'loss': '0.3152', 'grad_norm': '0.9807', 'learning_rate': '4.997e-05', 'epoch': '0.1745', 'num_input_tokens_seen': 14189804, 'train_runtime': '7177', 'train_tokens_per_second': '1977'} +{'loss': '0.7743', 'grad_norm': '1.598', 'learning_rate': '4.997e-05', 'epoch': '0.1746', 'num_input_tokens_seen': 14191851, 'train_runtime': '7178', 'train_tokens_per_second': '1977'} +{'loss': '0.7288', 'grad_norm': '1.46', 'learning_rate': '4.997e-05', 'epoch': '0.1746', 'num_input_tokens_seen': 14193898, 'train_runtime': '7179', 'train_tokens_per_second': '1977'} +{'loss': '1.45', 'grad_norm': '1.974', 'learning_rate': '4.997e-05', 'epoch': '0.1746', 'num_input_tokens_seen': 14195945, 'train_runtime': '7180', 'train_tokens_per_second': '1977'} +{'loss': '0.8917', 'grad_norm': '1.867', 'learning_rate': '4.997e-05', 'epoch': '0.1746', 'num_input_tokens_seen': 14197992, 'train_runtime': '7182', 'train_tokens_per_second': '1977'} +{'loss': '1.644', 'grad_norm': '2.446', 'learning_rate': '4.997e-05', 'epoch': '0.1747', 'num_input_tokens_seen': 14200039, 'train_runtime': '7183', 'train_tokens_per_second': '1977'} +{'loss': '1.618', 'grad_norm': '2.225', 'learning_rate': '4.997e-05', 'epoch': '0.1747', 'num_input_tokens_seen': 14202086, 'train_runtime': '7184', 'train_tokens_per_second': '1977'} +{'loss': '1.169', 'grad_norm': '1.323', 'learning_rate': '4.997e-05', 'epoch': '0.1747', 'num_input_tokens_seen': 14204133, 'train_runtime': '7185', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '1.903', 'learning_rate': '4.997e-05', 'epoch': '0.1747', 'num_input_tokens_seen': 14206180, 'train_runtime': '7186', 'train_tokens_per_second': '1977'} +{'loss': '1.368', 'grad_norm': '2.09', 'learning_rate': '4.997e-05', 'epoch': '0.1748', 'num_input_tokens_seen': 14208227, 'train_runtime': '7187', 'train_tokens_per_second': '1977'} +{'loss': '0.5204', 'grad_norm': '1.15', 'learning_rate': '4.997e-05', 'epoch': '0.1748', 'num_input_tokens_seen': 14210274, 'train_runtime': '7188', 'train_tokens_per_second': '1977'} +{'loss': '0.2843', 'grad_norm': '0.8484', 'learning_rate': '4.997e-05', 'epoch': '0.1748', 'num_input_tokens_seen': 14212321, 'train_runtime': '7189', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '1.953', 'learning_rate': '4.997e-05', 'epoch': '0.1748', 'num_input_tokens_seen': 14214368, 'train_runtime': '7190', 'train_tokens_per_second': '1977'} +{'loss': '0.707', 'grad_norm': '1.395', 'learning_rate': '4.997e-05', 'epoch': '0.1749', 'num_input_tokens_seen': 14216415, 'train_runtime': '7191', 'train_tokens_per_second': '1977'} +{'loss': '1.166', 'grad_norm': '1.947', 'learning_rate': '4.997e-05', 'epoch': '0.1749', 'num_input_tokens_seen': 14218462, 'train_runtime': '7192', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '1.885', 'learning_rate': '4.997e-05', 'epoch': '0.1749', 'num_input_tokens_seen': 14220509, 'train_runtime': '7193', 'train_tokens_per_second': '1977'} +{'loss': '1.602', 'grad_norm': '2.269', 'learning_rate': '4.997e-05', 'epoch': '0.1749', 'num_input_tokens_seen': 14222556, 'train_runtime': '7194', 'train_tokens_per_second': '1977'} +{'loss': '0.7592', 'grad_norm': '1.411', 'learning_rate': '4.997e-05', 'epoch': '0.175', 'num_input_tokens_seen': 14224603, 'train_runtime': '7195', 'train_tokens_per_second': '1977'} +{'loss': '0.5728', 'grad_norm': '1.169', 'learning_rate': '4.997e-05', 'epoch': '0.175', 'num_input_tokens_seen': 14226650, 'train_runtime': '7196', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.289', 'learning_rate': '4.997e-05', 'epoch': '0.175', 'num_input_tokens_seen': 14228697, 'train_runtime': '7197', 'train_tokens_per_second': '1977'} +{'loss': '0.7114', 'grad_norm': '1.014', 'learning_rate': '4.997e-05', 'epoch': '0.175', 'num_input_tokens_seen': 14230744, 'train_runtime': '7198', 'train_tokens_per_second': '1977'} +{'loss': '0.3465', 'grad_norm': '1.042', 'learning_rate': '4.997e-05', 'epoch': '0.1751', 'num_input_tokens_seen': 14232791, 'train_runtime': '7199', 'train_tokens_per_second': '1977'} +{'loss': '0.2606', 'grad_norm': '1.089', 'learning_rate': '4.997e-05', 'epoch': '0.1751', 'num_input_tokens_seen': 14234838, 'train_runtime': '7200', 'train_tokens_per_second': '1977'} +{'loss': '0.6736', 'grad_norm': '1.309', 'learning_rate': '4.997e-05', 'epoch': '0.1751', 'num_input_tokens_seen': 14236885, 'train_runtime': '7201', 'train_tokens_per_second': '1977'} +{'loss': '0.4857', 'grad_norm': '1.188', 'learning_rate': '4.997e-05', 'epoch': '0.1751', 'num_input_tokens_seen': 14238932, 'train_runtime': '7202', 'train_tokens_per_second': '1977'} +{'loss': '2.346', 'grad_norm': '2.977', 'learning_rate': '4.997e-05', 'epoch': '0.1752', 'num_input_tokens_seen': 14240979, 'train_runtime': '7203', 'train_tokens_per_second': '1977'} +{'loss': '0.3397', 'grad_norm': '1.135', 'learning_rate': '4.997e-05', 'epoch': '0.1752', 'num_input_tokens_seen': 14243026, 'train_runtime': '7204', 'train_tokens_per_second': '1977'} +{'loss': '2.128', 'grad_norm': '2.728', 'learning_rate': '4.997e-05', 'epoch': '0.1752', 'num_input_tokens_seen': 14245073, 'train_runtime': '7205', 'train_tokens_per_second': '1977'} +{'loss': '0.5742', 'grad_norm': '1.233', 'learning_rate': '4.997e-05', 'epoch': '0.1752', 'num_input_tokens_seen': 14247120, 'train_runtime': '7206', 'train_tokens_per_second': '1977'} +{'loss': '0.84', 'grad_norm': '1.276', 'learning_rate': '4.997e-05', 'epoch': '0.1753', 'num_input_tokens_seen': 14249167, 'train_runtime': '7207', 'train_tokens_per_second': '1977'} +{'loss': '2.155', 'grad_norm': '2.797', 'learning_rate': '4.997e-05', 'epoch': '0.1753', 'num_input_tokens_seen': 14251214, 'train_runtime': '7208', 'train_tokens_per_second': '1977'} +{'loss': '1.679', 'grad_norm': '2.435', 'learning_rate': '4.997e-05', 'epoch': '0.1753', 'num_input_tokens_seen': 14253261, 'train_runtime': '7209', 'train_tokens_per_second': '1977'} +{'loss': '1.836', 'grad_norm': '2.383', 'learning_rate': '4.997e-05', 'epoch': '0.1753', 'num_input_tokens_seen': 14255308, 'train_runtime': '7211', 'train_tokens_per_second': '1977'} +{'loss': '0.4984', 'grad_norm': '0.9693', 'learning_rate': '4.997e-05', 'epoch': '0.1754', 'num_input_tokens_seen': 14257355, 'train_runtime': '7212', 'train_tokens_per_second': '1977'} +{'loss': '0.5751', 'grad_norm': '1.22', 'learning_rate': '4.997e-05', 'epoch': '0.1754', 'num_input_tokens_seen': 14259402, 'train_runtime': '7213', 'train_tokens_per_second': '1977'} +{'loss': '0.3222', 'grad_norm': '0.8901', 'learning_rate': '4.997e-05', 'epoch': '0.1754', 'num_input_tokens_seen': 14261449, 'train_runtime': '7214', 'train_tokens_per_second': '1977'} +{'loss': '0.9247', 'grad_norm': '1.327', 'learning_rate': '4.997e-05', 'epoch': '0.1754', 'num_input_tokens_seen': 14263496, 'train_runtime': '7215', 'train_tokens_per_second': '1977'} +{'loss': '1.314', 'grad_norm': '1.769', 'learning_rate': '4.997e-05', 'epoch': '0.1755', 'num_input_tokens_seen': 14265543, 'train_runtime': '7216', 'train_tokens_per_second': '1977'} +{'loss': '0.4292', 'grad_norm': '0.9785', 'learning_rate': '4.997e-05', 'epoch': '0.1755', 'num_input_tokens_seen': 14267590, 'train_runtime': '7217', 'train_tokens_per_second': '1977'} +{'loss': '0.4355', 'grad_norm': '0.9159', 'learning_rate': '4.997e-05', 'epoch': '0.1755', 'num_input_tokens_seen': 14269637, 'train_runtime': '7218', 'train_tokens_per_second': '1977'} +{'loss': '0.6621', 'grad_norm': '1.283', 'learning_rate': '4.997e-05', 'epoch': '0.1755', 'num_input_tokens_seen': 14271684, 'train_runtime': '7219', 'train_tokens_per_second': '1977'} +{'loss': '0.6341', 'grad_norm': '1.531', 'learning_rate': '4.997e-05', 'epoch': '0.1756', 'num_input_tokens_seen': 14273731, 'train_runtime': '7220', 'train_tokens_per_second': '1977'} +{'loss': '0.3971', 'grad_norm': '0.9652', 'learning_rate': '4.997e-05', 'epoch': '0.1756', 'num_input_tokens_seen': 14275778, 'train_runtime': '7221', 'train_tokens_per_second': '1977'} +{'loss': '0.7165', 'grad_norm': '1.057', 'learning_rate': '4.997e-05', 'epoch': '0.1756', 'num_input_tokens_seen': 14277825, 'train_runtime': '7222', 'train_tokens_per_second': '1977'} +{'loss': '0.6064', 'grad_norm': '1.533', 'learning_rate': '4.997e-05', 'epoch': '0.1756', 'num_input_tokens_seen': 14279872, 'train_runtime': '7223', 'train_tokens_per_second': '1977'} +{'loss': '0.8481', 'grad_norm': '1.247', 'learning_rate': '4.997e-05', 'epoch': '0.1757', 'num_input_tokens_seen': 14281919, 'train_runtime': '7224', 'train_tokens_per_second': '1977'} +{'loss': '0.8584', 'grad_norm': '1.684', 'learning_rate': '4.997e-05', 'epoch': '0.1757', 'num_input_tokens_seen': 14283966, 'train_runtime': '7225', 'train_tokens_per_second': '1977'} +{'loss': '0.9229', 'grad_norm': '1.488', 'learning_rate': '4.997e-05', 'epoch': '0.1757', 'num_input_tokens_seen': 14286013, 'train_runtime': '7226', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.759', 'learning_rate': '4.997e-05', 'epoch': '0.1757', 'num_input_tokens_seen': 14288060, 'train_runtime': '7227', 'train_tokens_per_second': '1977'} +{'loss': '0.3308', 'grad_norm': '0.9257', 'learning_rate': '4.997e-05', 'epoch': '0.1758', 'num_input_tokens_seen': 14290107, 'train_runtime': '7228', 'train_tokens_per_second': '1977'} +{'loss': '0.3587', 'grad_norm': '0.9118', 'learning_rate': '4.997e-05', 'epoch': '0.1758', 'num_input_tokens_seen': 14292154, 'train_runtime': '7229', 'train_tokens_per_second': '1977'} +{'loss': '1.304', 'grad_norm': '1.714', 'learning_rate': '4.997e-05', 'epoch': '0.1758', 'num_input_tokens_seen': 14294201, 'train_runtime': '7230', 'train_tokens_per_second': '1977'} +{'loss': '0.6544', 'grad_norm': '1.31', 'learning_rate': '4.997e-05', 'epoch': '0.1758', 'num_input_tokens_seen': 14296248, 'train_runtime': '7231', 'train_tokens_per_second': '1977'} +{'loss': '2.126', 'grad_norm': '2.555', 'learning_rate': '4.997e-05', 'epoch': '0.1759', 'num_input_tokens_seen': 14298295, 'train_runtime': '7232', 'train_tokens_per_second': '1977'} +{'loss': '0.6104', 'grad_norm': '1.477', 'learning_rate': '4.997e-05', 'epoch': '0.1759', 'num_input_tokens_seen': 14300342, 'train_runtime': '7233', 'train_tokens_per_second': '1977'} +{'loss': '1.492', 'grad_norm': '2.38', 'learning_rate': '4.997e-05', 'epoch': '0.1759', 'num_input_tokens_seen': 14302389, 'train_runtime': '7234', 'train_tokens_per_second': '1977'} +{'loss': '1.345', 'grad_norm': '2.068', 'learning_rate': '4.997e-05', 'epoch': '0.1759', 'num_input_tokens_seen': 14304436, 'train_runtime': '7235', 'train_tokens_per_second': '1977'} +{'loss': '2.197', 'grad_norm': '3.466', 'learning_rate': '4.997e-05', 'epoch': '0.176', 'num_input_tokens_seen': 14306483, 'train_runtime': '7236', 'train_tokens_per_second': '1977'} +{'loss': '0.7602', 'grad_norm': '1.624', 'learning_rate': '4.997e-05', 'epoch': '0.176', 'num_input_tokens_seen': 14308530, 'train_runtime': '7237', 'train_tokens_per_second': '1977'} +{'loss': '1.767', 'grad_norm': '2.403', 'learning_rate': '4.997e-05', 'epoch': '0.176', 'num_input_tokens_seen': 14310577, 'train_runtime': '7238', 'train_tokens_per_second': '1977'} +{'loss': '1.303', 'grad_norm': '2.032', 'learning_rate': '4.997e-05', 'epoch': '0.176', 'num_input_tokens_seen': 14312624, 'train_runtime': '7240', 'train_tokens_per_second': '1977'} +{'loss': '0.7519', 'grad_norm': '1.062', 'learning_rate': '4.997e-05', 'epoch': '0.1761', 'num_input_tokens_seen': 14314671, 'train_runtime': '7241', 'train_tokens_per_second': '1977'} +{'loss': '0.3101', 'grad_norm': '1.077', 'learning_rate': '4.997e-05', 'epoch': '0.1761', 'num_input_tokens_seen': 14316718, 'train_runtime': '7242', 'train_tokens_per_second': '1977'} +{'loss': '0.8229', 'grad_norm': '1.303', 'learning_rate': '4.997e-05', 'epoch': '0.1761', 'num_input_tokens_seen': 14318765, 'train_runtime': '7243', 'train_tokens_per_second': '1977'} +{'loss': '0.6877', 'grad_norm': '1.303', 'learning_rate': '4.997e-05', 'epoch': '0.1761', 'num_input_tokens_seen': 14320812, 'train_runtime': '7244', 'train_tokens_per_second': '1977'} +{'loss': '0.6992', 'grad_norm': '0.9852', 'learning_rate': '4.997e-05', 'epoch': '0.1762', 'num_input_tokens_seen': 14322859, 'train_runtime': '7245', 'train_tokens_per_second': '1977'} +{'loss': '0.3298', 'grad_norm': '1.05', 'learning_rate': '4.997e-05', 'epoch': '0.1762', 'num_input_tokens_seen': 14324906, 'train_runtime': '7246', 'train_tokens_per_second': '1977'} +{'loss': '0.836', 'grad_norm': '1.378', 'learning_rate': '4.997e-05', 'epoch': '0.1762', 'num_input_tokens_seen': 14326953, 'train_runtime': '7247', 'train_tokens_per_second': '1977'} +{'loss': '0.9889', 'grad_norm': '1.384', 'learning_rate': '4.997e-05', 'epoch': '0.1762', 'num_input_tokens_seen': 14329000, 'train_runtime': '7248', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 04:38:13,291 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 04:38:13,291 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 04:38:13,639 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-7000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 04:38:13,647 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-7000/tokenizer_config.json + +{'loss': '0.8041', 'grad_norm': '1.398', 'learning_rate': '4.997e-05', 'epoch': '0.1763', 'num_input_tokens_seen': 14331047, 'train_runtime': '7249', 'train_tokens_per_second': '1977'} +{'loss': '0.3118', 'grad_norm': '1.12', 'learning_rate': '4.997e-05', 'epoch': '0.1763', 'num_input_tokens_seen': 14333094, 'train_runtime': '7250', 'train_tokens_per_second': '1977'} +{'loss': '0.3629', 'grad_norm': '1.108', 'learning_rate': '4.997e-05', 'epoch': '0.1763', 'num_input_tokens_seen': 14335141, 'train_runtime': '7251', 'train_tokens_per_second': '1977'} +{'loss': '0.899', 'grad_norm': '1.513', 'learning_rate': '4.997e-05', 'epoch': '0.1763', 'num_input_tokens_seen': 14337188, 'train_runtime': '7252', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '1.857', 'learning_rate': '4.997e-05', 'epoch': '0.1764', 'num_input_tokens_seen': 14339235, 'train_runtime': '7254', 'train_tokens_per_second': '1977'} +{'loss': '0.9398', 'grad_norm': '1.453', 'learning_rate': '4.997e-05', 'epoch': '0.1764', 'num_input_tokens_seen': 14341282, 'train_runtime': '7255', 'train_tokens_per_second': '1977'} +{'loss': '0.7024', 'grad_norm': '1.259', 'learning_rate': '4.997e-05', 'epoch': '0.1764', 'num_input_tokens_seen': 14343329, 'train_runtime': '7256', 'train_tokens_per_second': '1977'} +{'loss': '2.081', 'grad_norm': '2.443', 'learning_rate': '4.997e-05', 'epoch': '0.1764', 'num_input_tokens_seen': 14345376, 'train_runtime': '7257', 'train_tokens_per_second': '1977'} +{'loss': '0.7599', 'grad_norm': '1.13', 'learning_rate': '4.997e-05', 'epoch': '0.1765', 'num_input_tokens_seen': 14347423, 'train_runtime': '7258', 'train_tokens_per_second': '1977'} +{'loss': '0.8122', 'grad_norm': '1.693', 'learning_rate': '4.997e-05', 'epoch': '0.1765', 'num_input_tokens_seen': 14349470, 'train_runtime': '7259', 'train_tokens_per_second': '1977'} +{'loss': '0.6228', 'grad_norm': '0.958', 'learning_rate': '4.997e-05', 'epoch': '0.1765', 'num_input_tokens_seen': 14351517, 'train_runtime': '7260', 'train_tokens_per_second': '1977'} +{'loss': '0.4491', 'grad_norm': '1.091', 'learning_rate': '4.997e-05', 'epoch': '0.1765', 'num_input_tokens_seen': 14353564, 'train_runtime': '7261', 'train_tokens_per_second': '1977'} +{'loss': '0.2292', 'grad_norm': '0.8883', 'learning_rate': '4.997e-05', 'epoch': '0.1766', 'num_input_tokens_seen': 14355611, 'train_runtime': '7262', 'train_tokens_per_second': '1977'} +{'loss': '1.064', 'grad_norm': '1.478', 'learning_rate': '4.997e-05', 'epoch': '0.1766', 'num_input_tokens_seen': 14357658, 'train_runtime': '7263', 'train_tokens_per_second': '1977'} +{'loss': '0.6111', 'grad_norm': '0.9622', 'learning_rate': '4.997e-05', 'epoch': '0.1766', 'num_input_tokens_seen': 14359705, 'train_runtime': '7264', 'train_tokens_per_second': '1977'} +{'loss': '0.5339', 'grad_norm': '1.496', 'learning_rate': '4.997e-05', 'epoch': '0.1766', 'num_input_tokens_seen': 14361752, 'train_runtime': '7265', 'train_tokens_per_second': '1977'} +{'loss': '0.6719', 'grad_norm': '1.445', 'learning_rate': '4.997e-05', 'epoch': '0.1767', 'num_input_tokens_seen': 14363799, 'train_runtime': '7266', 'train_tokens_per_second': '1977'} +{'loss': '0.7355', 'grad_norm': '1.38', 'learning_rate': '4.997e-05', 'epoch': '0.1767', 'num_input_tokens_seen': 14365846, 'train_runtime': '7267', 'train_tokens_per_second': '1977'} +{'loss': '0.8622', 'grad_norm': '1.69', 'learning_rate': '4.997e-05', 'epoch': '0.1767', 'num_input_tokens_seen': 14367893, 'train_runtime': '7268', 'train_tokens_per_second': '1977'} +{'loss': '0.2663', 'grad_norm': '0.9604', 'learning_rate': '4.997e-05', 'epoch': '0.1768', 'num_input_tokens_seen': 14369940, 'train_runtime': '7269', 'train_tokens_per_second': '1977'} +{'loss': '1.803', 'grad_norm': '2.07', 'learning_rate': '4.997e-05', 'epoch': '0.1768', 'num_input_tokens_seen': 14371987, 'train_runtime': '7270', 'train_tokens_per_second': '1977'} +{'loss': '1.613', 'grad_norm': '2.292', 'learning_rate': '4.997e-05', 'epoch': '0.1768', 'num_input_tokens_seen': 14374034, 'train_runtime': '7271', 'train_tokens_per_second': '1977'} +{'loss': '0.8128', 'grad_norm': '1.583', 'learning_rate': '4.997e-05', 'epoch': '0.1768', 'num_input_tokens_seen': 14376081, 'train_runtime': '7272', 'train_tokens_per_second': '1977'} +{'loss': '0.7325', 'grad_norm': '1.277', 'learning_rate': '4.997e-05', 'epoch': '0.1769', 'num_input_tokens_seen': 14378128, 'train_runtime': '7273', 'train_tokens_per_second': '1977'} +{'loss': '0.4378', 'grad_norm': '1.122', 'learning_rate': '4.997e-05', 'epoch': '0.1769', 'num_input_tokens_seen': 14380175, 'train_runtime': '7274', 'train_tokens_per_second': '1977'} +{'loss': '0.8199', 'grad_norm': '1.81', 'learning_rate': '4.997e-05', 'epoch': '0.1769', 'num_input_tokens_seen': 14382222, 'train_runtime': '7275', 'train_tokens_per_second': '1977'} +{'loss': '1.499', 'grad_norm': '2.335', 'learning_rate': '4.997e-05', 'epoch': '0.1769', 'num_input_tokens_seen': 14384269, 'train_runtime': '7276', 'train_tokens_per_second': '1977'} +{'loss': '1.384', 'grad_norm': '2.052', 'learning_rate': '4.997e-05', 'epoch': '0.177', 'num_input_tokens_seen': 14386316, 'train_runtime': '7277', 'train_tokens_per_second': '1977'} +{'loss': '1.732', 'grad_norm': '2.461', 'learning_rate': '4.997e-05', 'epoch': '0.177', 'num_input_tokens_seen': 14388363, 'train_runtime': '7278', 'train_tokens_per_second': '1977'} +{'loss': '0.3188', 'grad_norm': '0.8918', 'learning_rate': '4.997e-05', 'epoch': '0.177', 'num_input_tokens_seen': 14390410, 'train_runtime': '7279', 'train_tokens_per_second': '1977'} +{'loss': '0.2868', 'grad_norm': '0.8722', 'learning_rate': '4.997e-05', 'epoch': '0.177', 'num_input_tokens_seen': 14392457, 'train_runtime': '7280', 'train_tokens_per_second': '1977'} +{'loss': '1.61', 'grad_norm': '3.185', 'learning_rate': '4.997e-05', 'epoch': '0.1771', 'num_input_tokens_seen': 14394504, 'train_runtime': '7281', 'train_tokens_per_second': '1977'} +{'loss': '0.3456', 'grad_norm': '1.052', 'learning_rate': '4.997e-05', 'epoch': '0.1771', 'num_input_tokens_seen': 14396551, 'train_runtime': '7282', 'train_tokens_per_second': '1977'} +{'loss': '1.833', 'grad_norm': '2.127', 'learning_rate': '4.997e-05', 'epoch': '0.1771', 'num_input_tokens_seen': 14398598, 'train_runtime': '7284', 'train_tokens_per_second': '1977'} +{'loss': '0.3316', 'grad_norm': '0.8273', 'learning_rate': '4.997e-05', 'epoch': '0.1771', 'num_input_tokens_seen': 14400645, 'train_runtime': '7285', 'train_tokens_per_second': '1977'} +{'loss': '1.074', 'grad_norm': '1.649', 'learning_rate': '4.997e-05', 'epoch': '0.1772', 'num_input_tokens_seen': 14402692, 'train_runtime': '7286', 'train_tokens_per_second': '1977'} +{'loss': '0.7744', 'grad_norm': '1.492', 'learning_rate': '4.997e-05', 'epoch': '0.1772', 'num_input_tokens_seen': 14404739, 'train_runtime': '7287', 'train_tokens_per_second': '1977'} +{'loss': '0.5276', 'grad_norm': '1.199', 'learning_rate': '4.997e-05', 'epoch': '0.1772', 'num_input_tokens_seen': 14406786, 'train_runtime': '7288', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '1.768', 'learning_rate': '4.997e-05', 'epoch': '0.1772', 'num_input_tokens_seen': 14408833, 'train_runtime': '7289', 'train_tokens_per_second': '1977'} +{'loss': '0.5867', 'grad_norm': '1.369', 'learning_rate': '4.997e-05', 'epoch': '0.1773', 'num_input_tokens_seen': 14410880, 'train_runtime': '7290', 'train_tokens_per_second': '1977'} +{'loss': '0.5097', 'grad_norm': '1.061', 'learning_rate': '4.997e-05', 'epoch': '0.1773', 'num_input_tokens_seen': 14412927, 'train_runtime': '7291', 'train_tokens_per_second': '1977'} +{'loss': '1.754', 'grad_norm': '2.281', 'learning_rate': '4.997e-05', 'epoch': '0.1773', 'num_input_tokens_seen': 14414974, 'train_runtime': '7292', 'train_tokens_per_second': '1977'} +{'loss': '0.9108', 'grad_norm': '1.381', 'learning_rate': '4.997e-05', 'epoch': '0.1773', 'num_input_tokens_seen': 14417021, 'train_runtime': '7293', 'train_tokens_per_second': '1977'} +{'loss': '1.558', 'grad_norm': '2.286', 'learning_rate': '4.997e-05', 'epoch': '0.1774', 'num_input_tokens_seen': 14419068, 'train_runtime': '7294', 'train_tokens_per_second': '1977'} +{'loss': '2.01', 'grad_norm': '2.417', 'learning_rate': '4.997e-05', 'epoch': '0.1774', 'num_input_tokens_seen': 14421115, 'train_runtime': '7295', 'train_tokens_per_second': '1977'} +{'loss': '0.9373', 'grad_norm': '1.48', 'learning_rate': '4.997e-05', 'epoch': '0.1774', 'num_input_tokens_seen': 14423162, 'train_runtime': '7296', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '2.256', 'learning_rate': '4.997e-05', 'epoch': '0.1774', 'num_input_tokens_seen': 14425209, 'train_runtime': '7297', 'train_tokens_per_second': '1977'} +{'loss': '0.3583', 'grad_norm': '0.9849', 'learning_rate': '4.997e-05', 'epoch': '0.1775', 'num_input_tokens_seen': 14427256, 'train_runtime': '7298', 'train_tokens_per_second': '1977'} +{'loss': '1.245', 'grad_norm': '2.355', 'learning_rate': '4.997e-05', 'epoch': '0.1775', 'num_input_tokens_seen': 14429303, 'train_runtime': '7299', 'train_tokens_per_second': '1977'} +{'loss': '1.699', 'grad_norm': '2.904', 'learning_rate': '4.997e-05', 'epoch': '0.1775', 'num_input_tokens_seen': 14431350, 'train_runtime': '7300', 'train_tokens_per_second': '1977'} +{'loss': '0.839', 'grad_norm': '1.565', 'learning_rate': '4.997e-05', 'epoch': '0.1775', 'num_input_tokens_seen': 14433397, 'train_runtime': '7301', 'train_tokens_per_second': '1977'} +{'loss': '1.879', 'grad_norm': '2.488', 'learning_rate': '4.997e-05', 'epoch': '0.1776', 'num_input_tokens_seen': 14435444, 'train_runtime': '7302', 'train_tokens_per_second': '1977'} +{'loss': '0.568', 'grad_norm': '1.098', 'learning_rate': '4.997e-05', 'epoch': '0.1776', 'num_input_tokens_seen': 14437491, 'train_runtime': '7303', 'train_tokens_per_second': '1977'} +{'loss': '0.3182', 'grad_norm': '1.214', 'learning_rate': '4.997e-05', 'epoch': '0.1776', 'num_input_tokens_seen': 14439538, 'train_runtime': '7304', 'train_tokens_per_second': '1977'} +{'loss': '0.8879', 'grad_norm': '1.552', 'learning_rate': '4.997e-05', 'epoch': '0.1776', 'num_input_tokens_seen': 14441585, 'train_runtime': '7305', 'train_tokens_per_second': '1977'} +{'loss': '0.6917', 'grad_norm': '1.555', 'learning_rate': '4.997e-05', 'epoch': '0.1777', 'num_input_tokens_seen': 14443632, 'train_runtime': '7306', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.717', 'learning_rate': '4.997e-05', 'epoch': '0.1777', 'num_input_tokens_seen': 14445679, 'train_runtime': '7307', 'train_tokens_per_second': '1977'} +{'loss': '0.7406', 'grad_norm': '1.393', 'learning_rate': '4.997e-05', 'epoch': '0.1777', 'num_input_tokens_seen': 14447726, 'train_runtime': '7308', 'train_tokens_per_second': '1977'} +{'loss': '0.3216', 'grad_norm': '0.909', 'learning_rate': '4.997e-05', 'epoch': '0.1777', 'num_input_tokens_seen': 14449773, 'train_runtime': '7309', 'train_tokens_per_second': '1977'} +{'loss': '1.669', 'grad_norm': '2.316', 'learning_rate': '4.997e-05', 'epoch': '0.1778', 'num_input_tokens_seen': 14451820, 'train_runtime': '7310', 'train_tokens_per_second': '1977'} +{'loss': '0.6701', 'grad_norm': '1.253', 'learning_rate': '4.997e-05', 'epoch': '0.1778', 'num_input_tokens_seen': 14453867, 'train_runtime': '7311', 'train_tokens_per_second': '1977'} +{'loss': '1.34', 'grad_norm': '2.093', 'learning_rate': '4.997e-05', 'epoch': '0.1778', 'num_input_tokens_seen': 14455914, 'train_runtime': '7312', 'train_tokens_per_second': '1977'} +{'loss': '0.3445', 'grad_norm': '0.9086', 'learning_rate': '4.997e-05', 'epoch': '0.1778', 'num_input_tokens_seen': 14457961, 'train_runtime': '7313', 'train_tokens_per_second': '1977'} +{'loss': '0.7437', 'grad_norm': '1.176', 'learning_rate': '4.997e-05', 'epoch': '0.1779', 'num_input_tokens_seen': 14460008, 'train_runtime': '7315', 'train_tokens_per_second': '1977'} +{'loss': '1.776', 'grad_norm': '2.384', 'learning_rate': '4.997e-05', 'epoch': '0.1779', 'num_input_tokens_seen': 14462055, 'train_runtime': '7316', 'train_tokens_per_second': '1977'} +{'loss': '0.4611', 'grad_norm': '1.131', 'learning_rate': '4.997e-05', 'epoch': '0.1779', 'num_input_tokens_seen': 14464102, 'train_runtime': '7317', 'train_tokens_per_second': '1977'} +{'loss': '0.7341', 'grad_norm': '1.389', 'learning_rate': '4.997e-05', 'epoch': '0.1779', 'num_input_tokens_seen': 14466149, 'train_runtime': '7318', 'train_tokens_per_second': '1977'} +{'loss': '0.6598', 'grad_norm': '0.9511', 'learning_rate': '4.997e-05', 'epoch': '0.178', 'num_input_tokens_seen': 14468196, 'train_runtime': '7319', 'train_tokens_per_second': '1977'} +{'loss': '0.3625', 'grad_norm': '1.15', 'learning_rate': '4.997e-05', 'epoch': '0.178', 'num_input_tokens_seen': 14470243, 'train_runtime': '7320', 'train_tokens_per_second': '1977'} +{'loss': '1.739', 'grad_norm': '2.218', 'learning_rate': '4.997e-05', 'epoch': '0.178', 'num_input_tokens_seen': 14472290, 'train_runtime': '7321', 'train_tokens_per_second': '1977'} +{'loss': '0.3599', 'grad_norm': '0.9132', 'learning_rate': '4.997e-05', 'epoch': '0.178', 'num_input_tokens_seen': 14474337, 'train_runtime': '7322', 'train_tokens_per_second': '1977'} +{'loss': '1.729', 'grad_norm': '2.236', 'learning_rate': '4.997e-05', 'epoch': '0.1781', 'num_input_tokens_seen': 14476384, 'train_runtime': '7323', 'train_tokens_per_second': '1977'} +{'loss': '1.071', 'grad_norm': '1.758', 'learning_rate': '4.997e-05', 'epoch': '0.1781', 'num_input_tokens_seen': 14478431, 'train_runtime': '7324', 'train_tokens_per_second': '1977'} +{'loss': '0.7538', 'grad_norm': '1.477', 'learning_rate': '4.997e-05', 'epoch': '0.1781', 'num_input_tokens_seen': 14480478, 'train_runtime': '7325', 'train_tokens_per_second': '1977'} +{'loss': '0.3837', 'grad_norm': '1.037', 'learning_rate': '4.997e-05', 'epoch': '0.1781', 'num_input_tokens_seen': 14482525, 'train_runtime': '7326', 'train_tokens_per_second': '1977'} +{'loss': '1.764', 'grad_norm': '2.276', 'learning_rate': '4.997e-05', 'epoch': '0.1782', 'num_input_tokens_seen': 14484572, 'train_runtime': '7327', 'train_tokens_per_second': '1977'} +{'loss': '1.614', 'grad_norm': '2.172', 'learning_rate': '4.997e-05', 'epoch': '0.1782', 'num_input_tokens_seen': 14486619, 'train_runtime': '7328', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '2.182', 'learning_rate': '4.997e-05', 'epoch': '0.1782', 'num_input_tokens_seen': 14488666, 'train_runtime': '7329', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.651', 'learning_rate': '4.997e-05', 'epoch': '0.1782', 'num_input_tokens_seen': 14490713, 'train_runtime': '7330', 'train_tokens_per_second': '1977'} +{'loss': '0.6725', 'grad_norm': '1.416', 'learning_rate': '4.997e-05', 'epoch': '0.1783', 'num_input_tokens_seen': 14492760, 'train_runtime': '7331', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.12', 'learning_rate': '4.997e-05', 'epoch': '0.1783', 'num_input_tokens_seen': 14494807, 'train_runtime': '7332', 'train_tokens_per_second': '1977'} +{'loss': '0.4583', 'grad_norm': '0.9692', 'learning_rate': '4.997e-05', 'epoch': '0.1783', 'num_input_tokens_seen': 14496854, 'train_runtime': '7333', 'train_tokens_per_second': '1977'} +{'loss': '0.3325', 'grad_norm': '0.9607', 'learning_rate': '4.997e-05', 'epoch': '0.1783', 'num_input_tokens_seen': 14498901, 'train_runtime': '7334', 'train_tokens_per_second': '1977'} +{'loss': '0.5414', 'grad_norm': '1.066', 'learning_rate': '4.997e-05', 'epoch': '0.1784', 'num_input_tokens_seen': 14500948, 'train_runtime': '7335', 'train_tokens_per_second': '1977'} +{'loss': '0.7308', 'grad_norm': '1.307', 'learning_rate': '4.997e-05', 'epoch': '0.1784', 'num_input_tokens_seen': 14502995, 'train_runtime': '7336', 'train_tokens_per_second': '1977'} +{'loss': '0.3432', 'grad_norm': '0.9268', 'learning_rate': '4.997e-05', 'epoch': '0.1784', 'num_input_tokens_seen': 14505042, 'train_runtime': '7337', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '1.839', 'learning_rate': '4.997e-05', 'epoch': '0.1784', 'num_input_tokens_seen': 14507089, 'train_runtime': '7338', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.659', 'learning_rate': '4.997e-05', 'epoch': '0.1785', 'num_input_tokens_seen': 14509136, 'train_runtime': '7339', 'train_tokens_per_second': '1977'} +{'loss': '0.2744', 'grad_norm': '0.9695', 'learning_rate': '4.997e-05', 'epoch': '0.1785', 'num_input_tokens_seen': 14511183, 'train_runtime': '7340', 'train_tokens_per_second': '1977'} +{'loss': '0.7888', 'grad_norm': '1.201', 'learning_rate': '4.997e-05', 'epoch': '0.1785', 'num_input_tokens_seen': 14513230, 'train_runtime': '7341', 'train_tokens_per_second': '1977'} +{'loss': '0.4987', 'grad_norm': '1.024', 'learning_rate': '4.997e-05', 'epoch': '0.1785', 'num_input_tokens_seen': 14515277, 'train_runtime': '7343', 'train_tokens_per_second': '1977'} +{'loss': '0.9115', 'grad_norm': '1.64', 'learning_rate': '4.997e-05', 'epoch': '0.1786', 'num_input_tokens_seen': 14517324, 'train_runtime': '7344', 'train_tokens_per_second': '1977'} +{'loss': '0.9269', 'grad_norm': '1.449', 'learning_rate': '4.997e-05', 'epoch': '0.1786', 'num_input_tokens_seen': 14519371, 'train_runtime': '7345', 'train_tokens_per_second': '1977'} +{'loss': '0.7401', 'grad_norm': '1.433', 'learning_rate': '4.997e-05', 'epoch': '0.1786', 'num_input_tokens_seen': 14521418, 'train_runtime': '7346', 'train_tokens_per_second': '1977'} +{'loss': '0.36', 'grad_norm': '0.7681', 'learning_rate': '4.997e-05', 'epoch': '0.1786', 'num_input_tokens_seen': 14523465, 'train_runtime': '7347', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '1.713', 'learning_rate': '4.997e-05', 'epoch': '0.1787', 'num_input_tokens_seen': 14525512, 'train_runtime': '7348', 'train_tokens_per_second': '1977'} +{'loss': '0.6218', 'grad_norm': '1.212', 'learning_rate': '4.997e-05', 'epoch': '0.1787', 'num_input_tokens_seen': 14527559, 'train_runtime': '7349', 'train_tokens_per_second': '1977'} +{'loss': '0.5804', 'grad_norm': '1.133', 'learning_rate': '4.997e-05', 'epoch': '0.1787', 'num_input_tokens_seen': 14529606, 'train_runtime': '7350', 'train_tokens_per_second': '1977'} +{'loss': '0.6777', 'grad_norm': '1.05', 'learning_rate': '4.997e-05', 'epoch': '0.1787', 'num_input_tokens_seen': 14531653, 'train_runtime': '7351', 'train_tokens_per_second': '1977'} +{'loss': '0.7724', 'grad_norm': '1.503', 'learning_rate': '4.997e-05', 'epoch': '0.1788', 'num_input_tokens_seen': 14533700, 'train_runtime': '7352', 'train_tokens_per_second': '1977'} +{'loss': '1.87', 'grad_norm': '2.275', 'learning_rate': '4.997e-05', 'epoch': '0.1788', 'num_input_tokens_seen': 14535747, 'train_runtime': '7353', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '2.049', 'learning_rate': '4.997e-05', 'epoch': '0.1788', 'num_input_tokens_seen': 14537794, 'train_runtime': '7354', 'train_tokens_per_second': '1977'} +{'loss': '0.859', 'grad_norm': '1.319', 'learning_rate': '4.997e-05', 'epoch': '0.1788', 'num_input_tokens_seen': 14539841, 'train_runtime': '7355', 'train_tokens_per_second': '1977'} +{'loss': '0.4547', 'grad_norm': '1.108', 'learning_rate': '4.997e-05', 'epoch': '0.1789', 'num_input_tokens_seen': 14541888, 'train_runtime': '7356', 'train_tokens_per_second': '1977'} +{'loss': '0.3419', 'grad_norm': '0.9977', 'learning_rate': '4.997e-05', 'epoch': '0.1789', 'num_input_tokens_seen': 14543935, 'train_runtime': '7357', 'train_tokens_per_second': '1977'} +{'loss': '0.8932', 'grad_norm': '1.535', 'learning_rate': '4.997e-05', 'epoch': '0.1789', 'num_input_tokens_seen': 14545982, 'train_runtime': '7358', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.562', 'learning_rate': '4.997e-05', 'epoch': '0.1789', 'num_input_tokens_seen': 14548029, 'train_runtime': '7359', 'train_tokens_per_second': '1977'} +{'loss': '0.7066', 'grad_norm': '1.305', 'learning_rate': '4.997e-05', 'epoch': '0.179', 'num_input_tokens_seen': 14550076, 'train_runtime': '7360', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '1.798', 'learning_rate': '4.997e-05', 'epoch': '0.179', 'num_input_tokens_seen': 14552123, 'train_runtime': '7361', 'train_tokens_per_second': '1977'} +{'loss': '1.343', 'grad_norm': '2.246', 'learning_rate': '4.997e-05', 'epoch': '0.179', 'num_input_tokens_seen': 14554170, 'train_runtime': '7362', 'train_tokens_per_second': '1977'} +{'loss': '0.8304', 'grad_norm': '1.524', 'learning_rate': '4.997e-05', 'epoch': '0.179', 'num_input_tokens_seen': 14556217, 'train_runtime': '7363', 'train_tokens_per_second': '1977'} +{'loss': '0.5566', 'grad_norm': '1.216', 'learning_rate': '4.997e-05', 'epoch': '0.1791', 'num_input_tokens_seen': 14558264, 'train_runtime': '7364', 'train_tokens_per_second': '1977'} +{'loss': '0.8497', 'grad_norm': '1.826', 'learning_rate': '4.997e-05', 'epoch': '0.1791', 'num_input_tokens_seen': 14560311, 'train_runtime': '7365', 'train_tokens_per_second': '1977'} +{'loss': '0.3708', 'grad_norm': '1.014', 'learning_rate': '4.997e-05', 'epoch': '0.1791', 'num_input_tokens_seen': 14562358, 'train_runtime': '7366', 'train_tokens_per_second': '1977'} +{'loss': '0.3055', 'grad_norm': '1.054', 'learning_rate': '4.997e-05', 'epoch': '0.1791', 'num_input_tokens_seen': 14564405, 'train_runtime': '7367', 'train_tokens_per_second': '1977'} +{'loss': '0.509', 'grad_norm': '1.197', 'learning_rate': '4.997e-05', 'epoch': '0.1792', 'num_input_tokens_seen': 14566452, 'train_runtime': '7368', 'train_tokens_per_second': '1977'} +{'loss': '0.3539', 'grad_norm': '0.9346', 'learning_rate': '4.997e-05', 'epoch': '0.1792', 'num_input_tokens_seen': 14568499, 'train_runtime': '7369', 'train_tokens_per_second': '1977'} +{'loss': '0.3676', 'grad_norm': '0.993', 'learning_rate': '4.997e-05', 'epoch': '0.1792', 'num_input_tokens_seen': 14570546, 'train_runtime': '7371', 'train_tokens_per_second': '1977'} +{'loss': '0.3721', 'grad_norm': '0.8088', 'learning_rate': '4.997e-05', 'epoch': '0.1792', 'num_input_tokens_seen': 14572593, 'train_runtime': '7372', 'train_tokens_per_second': '1977'} +{'loss': '2.297', 'grad_norm': '3.03', 'learning_rate': '4.997e-05', 'epoch': '0.1793', 'num_input_tokens_seen': 14574640, 'train_runtime': '7373', 'train_tokens_per_second': '1977'} +{'loss': '1.073', 'grad_norm': '1.533', 'learning_rate': '4.997e-05', 'epoch': '0.1793', 'num_input_tokens_seen': 14576687, 'train_runtime': '7374', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.341', 'learning_rate': '4.997e-05', 'epoch': '0.1793', 'num_input_tokens_seen': 14578734, 'train_runtime': '7375', 'train_tokens_per_second': '1977'} +{'loss': '0.9488', 'grad_norm': '1.905', 'learning_rate': '4.997e-05', 'epoch': '0.1793', 'num_input_tokens_seen': 14580781, 'train_runtime': '7376', 'train_tokens_per_second': '1977'} +{'loss': '0.7282', 'grad_norm': '1.243', 'learning_rate': '4.997e-05', 'epoch': '0.1794', 'num_input_tokens_seen': 14582828, 'train_runtime': '7377', 'train_tokens_per_second': '1977'} +{'loss': '0.565', 'grad_norm': '1.141', 'learning_rate': '4.997e-05', 'epoch': '0.1794', 'num_input_tokens_seen': 14584875, 'train_runtime': '7378', 'train_tokens_per_second': '1977'} +{'loss': '0.4204', 'grad_norm': '1.004', 'learning_rate': '4.997e-05', 'epoch': '0.1794', 'num_input_tokens_seen': 14586922, 'train_runtime': '7379', 'train_tokens_per_second': '1977'} +{'loss': '1.289', 'grad_norm': '2.01', 'learning_rate': '4.997e-05', 'epoch': '0.1794', 'num_input_tokens_seen': 14588969, 'train_runtime': '7380', 'train_tokens_per_second': '1977'} +{'loss': '0.6167', 'grad_norm': '1.335', 'learning_rate': '4.997e-05', 'epoch': '0.1795', 'num_input_tokens_seen': 14591016, 'train_runtime': '7381', 'train_tokens_per_second': '1977'} +{'loss': '0.5759', 'grad_norm': '1.119', 'learning_rate': '4.997e-05', 'epoch': '0.1795', 'num_input_tokens_seen': 14593063, 'train_runtime': '7382', 'train_tokens_per_second': '1977'} +{'loss': '1.378', 'grad_norm': '1.831', 'learning_rate': '4.997e-05', 'epoch': '0.1795', 'num_input_tokens_seen': 14595110, 'train_runtime': '7383', 'train_tokens_per_second': '1977'} +{'loss': '0.7702', 'grad_norm': '1.344', 'learning_rate': '4.997e-05', 'epoch': '0.1795', 'num_input_tokens_seen': 14597157, 'train_runtime': '7384', 'train_tokens_per_second': '1977'} +{'loss': '1.882', 'grad_norm': '2.362', 'learning_rate': '4.997e-05', 'epoch': '0.1796', 'num_input_tokens_seen': 14599204, 'train_runtime': '7385', 'train_tokens_per_second': '1977'} +{'loss': '0.4026', 'grad_norm': '1.042', 'learning_rate': '4.997e-05', 'epoch': '0.1796', 'num_input_tokens_seen': 14601251, 'train_runtime': '7386', 'train_tokens_per_second': '1977'} +{'loss': '2.102', 'grad_norm': '2.109', 'learning_rate': '4.997e-05', 'epoch': '0.1796', 'num_input_tokens_seen': 14603298, 'train_runtime': '7387', 'train_tokens_per_second': '1977'} +{'loss': '0.4477', 'grad_norm': '0.9553', 'learning_rate': '4.997e-05', 'epoch': '0.1796', 'num_input_tokens_seen': 14605345, 'train_runtime': '7388', 'train_tokens_per_second': '1977'} +{'loss': '0.9317', 'grad_norm': '1.38', 'learning_rate': '4.997e-05', 'epoch': '0.1797', 'num_input_tokens_seen': 14607392, 'train_runtime': '7389', 'train_tokens_per_second': '1977'} +{'loss': '0.795', 'grad_norm': '1.915', 'learning_rate': '4.997e-05', 'epoch': '0.1797', 'num_input_tokens_seen': 14609439, 'train_runtime': '7390', 'train_tokens_per_second': '1977'} +{'loss': '1.421', 'grad_norm': '1.749', 'learning_rate': '4.997e-05', 'epoch': '0.1797', 'num_input_tokens_seen': 14611486, 'train_runtime': '7391', 'train_tokens_per_second': '1977'} +{'loss': '0.3982', 'grad_norm': '0.9601', 'learning_rate': '4.997e-05', 'epoch': '0.1797', 'num_input_tokens_seen': 14613533, 'train_runtime': '7392', 'train_tokens_per_second': '1977'} +{'loss': '0.3651', 'grad_norm': '1.173', 'learning_rate': '4.997e-05', 'epoch': '0.1798', 'num_input_tokens_seen': 14615580, 'train_runtime': '7393', 'train_tokens_per_second': '1977'} +{'loss': '0.8372', 'grad_norm': '1.407', 'learning_rate': '4.997e-05', 'epoch': '0.1798', 'num_input_tokens_seen': 14617627, 'train_runtime': '7394', 'train_tokens_per_second': '1977'} +{'loss': '1.053', 'grad_norm': '1.754', 'learning_rate': '4.997e-05', 'epoch': '0.1798', 'num_input_tokens_seen': 14619674, 'train_runtime': '7395', 'train_tokens_per_second': '1977'} +{'loss': '1.295', 'grad_norm': '1.73', 'learning_rate': '4.997e-05', 'epoch': '0.1798', 'num_input_tokens_seen': 14621721, 'train_runtime': '7396', 'train_tokens_per_second': '1977'} +{'loss': '0.8097', 'grad_norm': '1.268', 'learning_rate': '4.997e-05', 'epoch': '0.1799', 'num_input_tokens_seen': 14623768, 'train_runtime': '7397', 'train_tokens_per_second': '1977'} +{'loss': '0.3791', 'grad_norm': '1.109', 'learning_rate': '4.997e-05', 'epoch': '0.1799', 'num_input_tokens_seen': 14625815, 'train_runtime': '7398', 'train_tokens_per_second': '1977'} +{'loss': '0.8739', 'grad_norm': '1.487', 'learning_rate': '4.997e-05', 'epoch': '0.1799', 'num_input_tokens_seen': 14627862, 'train_runtime': '7400', 'train_tokens_per_second': '1977'} +{'loss': '0.3976', 'grad_norm': '1.041', 'learning_rate': '4.997e-05', 'epoch': '0.1799', 'num_input_tokens_seen': 14629909, 'train_runtime': '7401', 'train_tokens_per_second': '1977'} +{'loss': '0.4068', 'grad_norm': '0.9468', 'learning_rate': '4.997e-05', 'epoch': '0.18', 'num_input_tokens_seen': 14631956, 'train_runtime': '7402', 'train_tokens_per_second': '1977'} +{'loss': '0.5265', 'grad_norm': '1.282', 'learning_rate': '4.997e-05', 'epoch': '0.18', 'num_input_tokens_seen': 14634003, 'train_runtime': '7403', 'train_tokens_per_second': '1977'} +{'loss': '1.701', 'grad_norm': '2.717', 'learning_rate': '4.997e-05', 'epoch': '0.18', 'num_input_tokens_seen': 14636050, 'train_runtime': '7404', 'train_tokens_per_second': '1977'} +{'loss': '0.8177', 'grad_norm': '1.241', 'learning_rate': '4.997e-05', 'epoch': '0.18', 'num_input_tokens_seen': 14638097, 'train_runtime': '7405', 'train_tokens_per_second': '1977'} +{'loss': '0.5296', 'grad_norm': '1.362', 'learning_rate': '4.997e-05', 'epoch': '0.1801', 'num_input_tokens_seen': 14640144, 'train_runtime': '7406', 'train_tokens_per_second': '1977'} +{'loss': '0.7968', 'grad_norm': '1.363', 'learning_rate': '4.997e-05', 'epoch': '0.1801', 'num_input_tokens_seen': 14642191, 'train_runtime': '7407', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '1.81', 'learning_rate': '4.997e-05', 'epoch': '0.1801', 'num_input_tokens_seen': 14644238, 'train_runtime': '7408', 'train_tokens_per_second': '1977'} +{'loss': '0.6236', 'grad_norm': '1.221', 'learning_rate': '4.997e-05', 'epoch': '0.1801', 'num_input_tokens_seen': 14646285, 'train_runtime': '7409', 'train_tokens_per_second': '1977'} +{'loss': '0.4204', 'grad_norm': '1.076', 'learning_rate': '4.997e-05', 'epoch': '0.1802', 'num_input_tokens_seen': 14648332, 'train_runtime': '7410', 'train_tokens_per_second': '1977'} +{'loss': '0.653', 'grad_norm': '1.186', 'learning_rate': '4.997e-05', 'epoch': '0.1802', 'num_input_tokens_seen': 14650379, 'train_runtime': '7411', 'train_tokens_per_second': '1977'} +{'loss': '0.3617', 'grad_norm': '1.075', 'learning_rate': '4.997e-05', 'epoch': '0.1802', 'num_input_tokens_seen': 14652426, 'train_runtime': '7412', 'train_tokens_per_second': '1977'} +{'loss': '0.4921', 'grad_norm': '1.126', 'learning_rate': '4.997e-05', 'epoch': '0.1803', 'num_input_tokens_seen': 14654473, 'train_runtime': '7413', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '1.135', 'learning_rate': '4.997e-05', 'epoch': '0.1803', 'num_input_tokens_seen': 14656520, 'train_runtime': '7414', 'train_tokens_per_second': '1977'} +{'loss': '0.7837', 'grad_norm': '1.477', 'learning_rate': '4.997e-05', 'epoch': '0.1803', 'num_input_tokens_seen': 14658567, 'train_runtime': '7415', 'train_tokens_per_second': '1977'} +{'loss': '0.6941', 'grad_norm': '1.607', 'learning_rate': '4.997e-05', 'epoch': '0.1803', 'num_input_tokens_seen': 14660614, 'train_runtime': '7416', 'train_tokens_per_second': '1977'} +{'loss': '1.198', 'grad_norm': '2.07', 'learning_rate': '4.997e-05', 'epoch': '0.1804', 'num_input_tokens_seen': 14662661, 'train_runtime': '7417', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '2.191', 'learning_rate': '4.997e-05', 'epoch': '0.1804', 'num_input_tokens_seen': 14664708, 'train_runtime': '7418', 'train_tokens_per_second': '1977'} +{'loss': '1.175', 'grad_norm': '2.141', 'learning_rate': '4.997e-05', 'epoch': '0.1804', 'num_input_tokens_seen': 14666755, 'train_runtime': '7419', 'train_tokens_per_second': '1977'} +{'loss': '0.5851', 'grad_norm': '1.24', 'learning_rate': '4.997e-05', 'epoch': '0.1804', 'num_input_tokens_seen': 14668802, 'train_runtime': '7420', 'train_tokens_per_second': '1977'} +{'loss': '2.141', 'grad_norm': '2.974', 'learning_rate': '4.997e-05', 'epoch': '0.1805', 'num_input_tokens_seen': 14670849, 'train_runtime': '7421', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '1.728', 'learning_rate': '4.997e-05', 'epoch': '0.1805', 'num_input_tokens_seen': 14672896, 'train_runtime': '7422', 'train_tokens_per_second': '1977'} +{'loss': '0.387', 'grad_norm': '1.221', 'learning_rate': '4.997e-05', 'epoch': '0.1805', 'num_input_tokens_seen': 14674943, 'train_runtime': '7423', 'train_tokens_per_second': '1977'} +{'loss': '0.5644', 'grad_norm': '1.168', 'learning_rate': '4.997e-05', 'epoch': '0.1805', 'num_input_tokens_seen': 14676990, 'train_runtime': '7424', 'train_tokens_per_second': '1977'} +{'loss': '0.9558', 'grad_norm': '1.541', 'learning_rate': '4.997e-05', 'epoch': '0.1806', 'num_input_tokens_seen': 14679037, 'train_runtime': '7425', 'train_tokens_per_second': '1977'} +{'loss': '0.4607', 'grad_norm': '1.285', 'learning_rate': '4.997e-05', 'epoch': '0.1806', 'num_input_tokens_seen': 14681084, 'train_runtime': '7426', 'train_tokens_per_second': '1977'} +{'loss': '1.416', 'grad_norm': '2.261', 'learning_rate': '4.997e-05', 'epoch': '0.1806', 'num_input_tokens_seen': 14683131, 'train_runtime': '7428', 'train_tokens_per_second': '1977'} +{'loss': '0.474', 'grad_norm': '1.069', 'learning_rate': '4.997e-05', 'epoch': '0.1806', 'num_input_tokens_seen': 14685178, 'train_runtime': '7429', 'train_tokens_per_second': '1977'} +{'loss': '0.4732', 'grad_norm': '1.318', 'learning_rate': '4.997e-05', 'epoch': '0.1807', 'num_input_tokens_seen': 14687225, 'train_runtime': '7430', 'train_tokens_per_second': '1977'} +{'loss': '0.6508', 'grad_norm': '1.83', 'learning_rate': '4.997e-05', 'epoch': '0.1807', 'num_input_tokens_seen': 14689272, 'train_runtime': '7431', 'train_tokens_per_second': '1977'} +{'loss': '0.7918', 'grad_norm': '1.052', 'learning_rate': '4.997e-05', 'epoch': '0.1807', 'num_input_tokens_seen': 14691319, 'train_runtime': '7432', 'train_tokens_per_second': '1977'} +{'loss': '0.9007', 'grad_norm': '1.771', 'learning_rate': '4.997e-05', 'epoch': '0.1807', 'num_input_tokens_seen': 14693366, 'train_runtime': '7433', 'train_tokens_per_second': '1977'} +{'loss': '0.8081', 'grad_norm': '1.234', 'learning_rate': '4.997e-05', 'epoch': '0.1808', 'num_input_tokens_seen': 14695413, 'train_runtime': '7434', 'train_tokens_per_second': '1977'} +{'loss': '0.7817', 'grad_norm': '1.348', 'learning_rate': '4.997e-05', 'epoch': '0.1808', 'num_input_tokens_seen': 14697460, 'train_runtime': '7435', 'train_tokens_per_second': '1977'} +{'loss': '0.7275', 'grad_norm': '1.534', 'learning_rate': '4.997e-05', 'epoch': '0.1808', 'num_input_tokens_seen': 14699507, 'train_runtime': '7436', 'train_tokens_per_second': '1977'} +{'loss': '1.189', 'grad_norm': '1.908', 'learning_rate': '4.997e-05', 'epoch': '0.1808', 'num_input_tokens_seen': 14701554, 'train_runtime': '7437', 'train_tokens_per_second': '1977'} +{'loss': '0.3353', 'grad_norm': '0.9488', 'learning_rate': '4.997e-05', 'epoch': '0.1809', 'num_input_tokens_seen': 14703601, 'train_runtime': '7438', 'train_tokens_per_second': '1977'} +{'loss': '0.7861', 'grad_norm': '1.254', 'learning_rate': '4.997e-05', 'epoch': '0.1809', 'num_input_tokens_seen': 14705648, 'train_runtime': '7439', 'train_tokens_per_second': '1977'} +{'loss': '0.3895', 'grad_norm': '0.9647', 'learning_rate': '4.997e-05', 'epoch': '0.1809', 'num_input_tokens_seen': 14707695, 'train_runtime': '7440', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.91', 'learning_rate': '4.997e-05', 'epoch': '0.1809', 'num_input_tokens_seen': 14709742, 'train_runtime': '7441', 'train_tokens_per_second': '1977'} +{'loss': '0.5698', 'grad_norm': '1.191', 'learning_rate': '4.997e-05', 'epoch': '0.181', 'num_input_tokens_seen': 14711789, 'train_runtime': '7442', 'train_tokens_per_second': '1977'} +{'loss': '0.8575', 'grad_norm': '1.158', 'learning_rate': '4.997e-05', 'epoch': '0.181', 'num_input_tokens_seen': 14713836, 'train_runtime': '7443', 'train_tokens_per_second': '1977'} +{'loss': '1.221', 'grad_norm': '1.763', 'learning_rate': '4.997e-05', 'epoch': '0.181', 'num_input_tokens_seen': 14715883, 'train_runtime': '7444', 'train_tokens_per_second': '1977'} +{'loss': '1.384', 'grad_norm': '1.757', 'learning_rate': '4.997e-05', 'epoch': '0.181', 'num_input_tokens_seen': 14717930, 'train_runtime': '7445', 'train_tokens_per_second': '1977'} +{'loss': '0.5206', 'grad_norm': '1.113', 'learning_rate': '4.997e-05', 'epoch': '0.1811', 'num_input_tokens_seen': 14719977, 'train_runtime': '7446', 'train_tokens_per_second': '1977'} +{'loss': '2.25', 'grad_norm': '2.3', 'learning_rate': '4.997e-05', 'epoch': '0.1811', 'num_input_tokens_seen': 14722024, 'train_runtime': '7447', 'train_tokens_per_second': '1977'} +{'loss': '0.4806', 'grad_norm': '1.125', 'learning_rate': '4.997e-05', 'epoch': '0.1811', 'num_input_tokens_seen': 14724071, 'train_runtime': '7448', 'train_tokens_per_second': '1977'} +{'loss': '2.834', 'grad_norm': '2.589', 'learning_rate': '4.997e-05', 'epoch': '0.1811', 'num_input_tokens_seen': 14726118, 'train_runtime': '7449', 'train_tokens_per_second': '1977'} +{'loss': '0.9358', 'grad_norm': '1.454', 'learning_rate': '4.997e-05', 'epoch': '0.1812', 'num_input_tokens_seen': 14728165, 'train_runtime': '7450', 'train_tokens_per_second': '1977'} +{'loss': '1.393', 'grad_norm': '2.133', 'learning_rate': '4.997e-05', 'epoch': '0.1812', 'num_input_tokens_seen': 14730212, 'train_runtime': '7451', 'train_tokens_per_second': '1977'} +{'loss': '1.255', 'grad_norm': '1.757', 'learning_rate': '4.997e-05', 'epoch': '0.1812', 'num_input_tokens_seen': 14732259, 'train_runtime': '7452', 'train_tokens_per_second': '1977'} +{'loss': '0.8859', 'grad_norm': '1.401', 'learning_rate': '4.997e-05', 'epoch': '0.1812', 'num_input_tokens_seen': 14734306, 'train_runtime': '7453', 'train_tokens_per_second': '1977'} +{'loss': '0.2202', 'grad_norm': '0.9145', 'learning_rate': '4.997e-05', 'epoch': '0.1813', 'num_input_tokens_seen': 14736353, 'train_runtime': '7454', 'train_tokens_per_second': '1977'} +{'loss': '0.399', 'grad_norm': '1.006', 'learning_rate': '4.997e-05', 'epoch': '0.1813', 'num_input_tokens_seen': 14738400, 'train_runtime': '7455', 'train_tokens_per_second': '1977'} +{'loss': '0.7806', 'grad_norm': '1.244', 'learning_rate': '4.997e-05', 'epoch': '0.1813', 'num_input_tokens_seen': 14740447, 'train_runtime': '7456', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '1.818', 'learning_rate': '4.997e-05', 'epoch': '0.1813', 'num_input_tokens_seen': 14742494, 'train_runtime': '7458', 'train_tokens_per_second': '1977'} +{'loss': '0.3057', 'grad_norm': '1.035', 'learning_rate': '4.997e-05', 'epoch': '0.1814', 'num_input_tokens_seen': 14744541, 'train_runtime': '7459', 'train_tokens_per_second': '1977'} +{'loss': '0.4675', 'grad_norm': '1.239', 'learning_rate': '4.997e-05', 'epoch': '0.1814', 'num_input_tokens_seen': 14746588, 'train_runtime': '7460', 'train_tokens_per_second': '1977'} +{'loss': '2.173', 'grad_norm': '2.571', 'learning_rate': '4.997e-05', 'epoch': '0.1814', 'num_input_tokens_seen': 14748635, 'train_runtime': '7461', 'train_tokens_per_second': '1977'} +{'loss': '0.6133', 'grad_norm': '1.049', 'learning_rate': '4.997e-05', 'epoch': '0.1814', 'num_input_tokens_seen': 14750682, 'train_runtime': '7462', 'train_tokens_per_second': '1977'} +{'loss': '0.9424', 'grad_norm': '1.543', 'learning_rate': '4.997e-05', 'epoch': '0.1815', 'num_input_tokens_seen': 14752729, 'train_runtime': '7463', 'train_tokens_per_second': '1977'} +{'loss': '0.8788', 'grad_norm': '1.453', 'learning_rate': '4.997e-05', 'epoch': '0.1815', 'num_input_tokens_seen': 14754776, 'train_runtime': '7464', 'train_tokens_per_second': '1977'} +{'loss': '0.9085', 'grad_norm': '1.412', 'learning_rate': '4.997e-05', 'epoch': '0.1815', 'num_input_tokens_seen': 14756823, 'train_runtime': '7465', 'train_tokens_per_second': '1977'} +{'loss': '0.8045', 'grad_norm': '1.167', 'learning_rate': '4.997e-05', 'epoch': '0.1815', 'num_input_tokens_seen': 14758870, 'train_runtime': '7466', 'train_tokens_per_second': '1977'} +{'loss': '0.226', 'grad_norm': '0.775', 'learning_rate': '4.997e-05', 'epoch': '0.1816', 'num_input_tokens_seen': 14760917, 'train_runtime': '7467', 'train_tokens_per_second': '1977'} +{'loss': '1.207', 'grad_norm': '1.97', 'learning_rate': '4.997e-05', 'epoch': '0.1816', 'num_input_tokens_seen': 14762964, 'train_runtime': '7468', 'train_tokens_per_second': '1977'} +{'loss': '0.736', 'grad_norm': '0.9411', 'learning_rate': '4.997e-05', 'epoch': '0.1816', 'num_input_tokens_seen': 14765011, 'train_runtime': '7469', 'train_tokens_per_second': '1977'} +{'loss': '0.4723', 'grad_norm': '1.12', 'learning_rate': '4.997e-05', 'epoch': '0.1816', 'num_input_tokens_seen': 14767058, 'train_runtime': '7470', 'train_tokens_per_second': '1977'} +{'loss': '0.4119', 'grad_norm': '1.04', 'learning_rate': '4.997e-05', 'epoch': '0.1817', 'num_input_tokens_seen': 14769105, 'train_runtime': '7471', 'train_tokens_per_second': '1977'} +{'loss': '0.8086', 'grad_norm': '1.77', 'learning_rate': '4.997e-05', 'epoch': '0.1817', 'num_input_tokens_seen': 14771152, 'train_runtime': '7472', 'train_tokens_per_second': '1977'} +{'loss': '1.281', 'grad_norm': '1.888', 'learning_rate': '4.997e-05', 'epoch': '0.1817', 'num_input_tokens_seen': 14773199, 'train_runtime': '7473', 'train_tokens_per_second': '1977'} +{'loss': '0.5943', 'grad_norm': '1.186', 'learning_rate': '4.997e-05', 'epoch': '0.1817', 'num_input_tokens_seen': 14775246, 'train_runtime': '7474', 'train_tokens_per_second': '1977'} +{'loss': '0.4394', 'grad_norm': '1.049', 'learning_rate': '4.997e-05', 'epoch': '0.1818', 'num_input_tokens_seen': 14777293, 'train_runtime': '7475', 'train_tokens_per_second': '1977'} +{'loss': '0.8813', 'grad_norm': '1.419', 'learning_rate': '4.997e-05', 'epoch': '0.1818', 'num_input_tokens_seen': 14779340, 'train_runtime': '7476', 'train_tokens_per_second': '1977'} +{'loss': '1.405', 'grad_norm': '1.985', 'learning_rate': '4.997e-05', 'epoch': '0.1818', 'num_input_tokens_seen': 14781387, 'train_runtime': '7477', 'train_tokens_per_second': '1977'} +{'loss': '0.5311', 'grad_norm': '1.314', 'learning_rate': '4.997e-05', 'epoch': '0.1818', 'num_input_tokens_seen': 14783434, 'train_runtime': '7478', 'train_tokens_per_second': '1977'} +{'loss': '0.6318', 'grad_norm': '1.295', 'learning_rate': '4.997e-05', 'epoch': '0.1819', 'num_input_tokens_seen': 14785481, 'train_runtime': '7479', 'train_tokens_per_second': '1977'} +{'loss': '0.3634', 'grad_norm': '1.008', 'learning_rate': '4.997e-05', 'epoch': '0.1819', 'num_input_tokens_seen': 14787528, 'train_runtime': '7480', 'train_tokens_per_second': '1977'} +{'loss': '0.4132', 'grad_norm': '0.8864', 'learning_rate': '4.997e-05', 'epoch': '0.1819', 'num_input_tokens_seen': 14789575, 'train_runtime': '7481', 'train_tokens_per_second': '1977'} +{'loss': '0.5301', 'grad_norm': '1.187', 'learning_rate': '4.997e-05', 'epoch': '0.1819', 'num_input_tokens_seen': 14791622, 'train_runtime': '7482', 'train_tokens_per_second': '1977'} +{'loss': '0.3642', 'grad_norm': '0.9158', 'learning_rate': '4.997e-05', 'epoch': '0.182', 'num_input_tokens_seen': 14793669, 'train_runtime': '7483', 'train_tokens_per_second': '1977'} +{'loss': '0.884', 'grad_norm': '1.346', 'learning_rate': '4.997e-05', 'epoch': '0.182', 'num_input_tokens_seen': 14795716, 'train_runtime': '7484', 'train_tokens_per_second': '1977'} +{'loss': '0.4206', 'grad_norm': '0.963', 'learning_rate': '4.997e-05', 'epoch': '0.182', 'num_input_tokens_seen': 14797763, 'train_runtime': '7486', 'train_tokens_per_second': '1977'} +{'loss': '0.5948', 'grad_norm': '1.582', 'learning_rate': '4.997e-05', 'epoch': '0.182', 'num_input_tokens_seen': 14799810, 'train_runtime': '7487', 'train_tokens_per_second': '1977'} +{'loss': '0.9262', 'grad_norm': '1.355', 'learning_rate': '4.997e-05', 'epoch': '0.1821', 'num_input_tokens_seen': 14801857, 'train_runtime': '7488', 'train_tokens_per_second': '1977'} +{'loss': '0.85', 'grad_norm': '1.445', 'learning_rate': '4.997e-05', 'epoch': '0.1821', 'num_input_tokens_seen': 14803904, 'train_runtime': '7489', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '1.834', 'learning_rate': '4.997e-05', 'epoch': '0.1821', 'num_input_tokens_seen': 14805951, 'train_runtime': '7490', 'train_tokens_per_second': '1977'} +{'loss': '0.7157', 'grad_norm': '1.554', 'learning_rate': '4.997e-05', 'epoch': '0.1821', 'num_input_tokens_seen': 14807998, 'train_runtime': '7491', 'train_tokens_per_second': '1977'} +{'loss': '0.6161', 'grad_norm': '1.42', 'learning_rate': '4.997e-05', 'epoch': '0.1822', 'num_input_tokens_seen': 14810045, 'train_runtime': '7492', 'train_tokens_per_second': '1977'} +{'loss': '0.6506', 'grad_norm': '0.9998', 'learning_rate': '4.997e-05', 'epoch': '0.1822', 'num_input_tokens_seen': 14812092, 'train_runtime': '7493', 'train_tokens_per_second': '1977'} +{'loss': '0.6151', 'grad_norm': '1.154', 'learning_rate': '4.997e-05', 'epoch': '0.1822', 'num_input_tokens_seen': 14814139, 'train_runtime': '7494', 'train_tokens_per_second': '1977'} +{'loss': '0.9655', 'grad_norm': '1.687', 'learning_rate': '4.997e-05', 'epoch': '0.1822', 'num_input_tokens_seen': 14816186, 'train_runtime': '7495', 'train_tokens_per_second': '1977'} +{'loss': '0.3438', 'grad_norm': '0.8685', 'learning_rate': '4.997e-05', 'epoch': '0.1823', 'num_input_tokens_seen': 14818233, 'train_runtime': '7496', 'train_tokens_per_second': '1977'} +{'loss': '0.6419', 'grad_norm': '1.424', 'learning_rate': '4.997e-05', 'epoch': '0.1823', 'num_input_tokens_seen': 14820280, 'train_runtime': '7497', 'train_tokens_per_second': '1977'} +{'loss': '0.925', 'grad_norm': '1.446', 'learning_rate': '4.997e-05', 'epoch': '0.1823', 'num_input_tokens_seen': 14822327, 'train_runtime': '7498', 'train_tokens_per_second': '1977'} +{'loss': '1.694', 'grad_norm': '2.344', 'learning_rate': '4.997e-05', 'epoch': '0.1823', 'num_input_tokens_seen': 14824374, 'train_runtime': '7499', 'train_tokens_per_second': '1977'} +{'loss': '0.4581', 'grad_norm': '1.151', 'learning_rate': '4.997e-05', 'epoch': '0.1824', 'num_input_tokens_seen': 14826421, 'train_runtime': '7500', 'train_tokens_per_second': '1977'} +{'loss': '0.2993', 'grad_norm': '0.9881', 'learning_rate': '4.997e-05', 'epoch': '0.1824', 'num_input_tokens_seen': 14828468, 'train_runtime': '7501', 'train_tokens_per_second': '1977'} +{'loss': '1.623', 'grad_norm': '2.292', 'learning_rate': '4.997e-05', 'epoch': '0.1824', 'num_input_tokens_seen': 14830515, 'train_runtime': '7502', 'train_tokens_per_second': '1977'} +{'loss': '0.8363', 'grad_norm': '1.388', 'learning_rate': '4.997e-05', 'epoch': '0.1824', 'num_input_tokens_seen': 14832562, 'train_runtime': '7503', 'train_tokens_per_second': '1977'} +{'loss': '0.2925', 'grad_norm': '0.8955', 'learning_rate': '4.997e-05', 'epoch': '0.1825', 'num_input_tokens_seen': 14834609, 'train_runtime': '7504', 'train_tokens_per_second': '1977'} +{'loss': '1.302', 'grad_norm': '1.911', 'learning_rate': '4.997e-05', 'epoch': '0.1825', 'num_input_tokens_seen': 14836656, 'train_runtime': '7505', 'train_tokens_per_second': '1977'} +{'loss': '1.12', 'grad_norm': '1.407', 'learning_rate': '4.997e-05', 'epoch': '0.1825', 'num_input_tokens_seen': 14838703, 'train_runtime': '7506', 'train_tokens_per_second': '1977'} +{'loss': '0.8338', 'grad_norm': '1.748', 'learning_rate': '4.997e-05', 'epoch': '0.1825', 'num_input_tokens_seen': 14840750, 'train_runtime': '7507', 'train_tokens_per_second': '1977'} +{'loss': '2.139', 'grad_norm': '2.894', 'learning_rate': '4.997e-05', 'epoch': '0.1826', 'num_input_tokens_seen': 14842797, 'train_runtime': '7508', 'train_tokens_per_second': '1977'} +{'loss': '0.3297', 'grad_norm': '0.9224', 'learning_rate': '4.996e-05', 'epoch': '0.1826', 'num_input_tokens_seen': 14844844, 'train_runtime': '7509', 'train_tokens_per_second': '1977'} +{'loss': '0.7834', 'grad_norm': '1.649', 'learning_rate': '4.996e-05', 'epoch': '0.1826', 'num_input_tokens_seen': 14846891, 'train_runtime': '7510', 'train_tokens_per_second': '1977'} +{'loss': '1.276', 'grad_norm': '2.094', 'learning_rate': '4.996e-05', 'epoch': '0.1826', 'num_input_tokens_seen': 14848938, 'train_runtime': '7511', 'train_tokens_per_second': '1977'} +{'loss': '0.7936', 'grad_norm': '1.198', 'learning_rate': '4.996e-05', 'epoch': '0.1827', 'num_input_tokens_seen': 14850985, 'train_runtime': '7512', 'train_tokens_per_second': '1977'} +{'loss': '0.304', 'grad_norm': '0.9416', 'learning_rate': '4.996e-05', 'epoch': '0.1827', 'num_input_tokens_seen': 14853032, 'train_runtime': '7513', 'train_tokens_per_second': '1977'} +{'loss': '0.6905', 'grad_norm': '1.031', 'learning_rate': '4.996e-05', 'epoch': '0.1827', 'num_input_tokens_seen': 14855079, 'train_runtime': '7515', 'train_tokens_per_second': '1977'} +{'loss': '1.694', 'grad_norm': '2.157', 'learning_rate': '4.996e-05', 'epoch': '0.1827', 'num_input_tokens_seen': 14857126, 'train_runtime': '7516', 'train_tokens_per_second': '1977'} +{'loss': '2.288', 'grad_norm': '2.74', 'learning_rate': '4.996e-05', 'epoch': '0.1828', 'num_input_tokens_seen': 14859173, 'train_runtime': '7517', 'train_tokens_per_second': '1977'} +{'loss': '0.383', 'grad_norm': '1.136', 'learning_rate': '4.996e-05', 'epoch': '0.1828', 'num_input_tokens_seen': 14861220, 'train_runtime': '7518', 'train_tokens_per_second': '1977'} +{'loss': '0.8631', 'grad_norm': '1.41', 'learning_rate': '4.996e-05', 'epoch': '0.1828', 'num_input_tokens_seen': 14863267, 'train_runtime': '7519', 'train_tokens_per_second': '1977'} +{'loss': '1.085', 'grad_norm': '1.996', 'learning_rate': '4.996e-05', 'epoch': '0.1828', 'num_input_tokens_seen': 14865314, 'train_runtime': '7520', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '1.895', 'learning_rate': '4.996e-05', 'epoch': '0.1829', 'num_input_tokens_seen': 14867361, 'train_runtime': '7521', 'train_tokens_per_second': '1977'} +{'loss': '0.9433', 'grad_norm': '1.322', 'learning_rate': '4.996e-05', 'epoch': '0.1829', 'num_input_tokens_seen': 14869408, 'train_runtime': '7522', 'train_tokens_per_second': '1977'} +{'loss': '0.6761', 'grad_norm': '1.247', 'learning_rate': '4.996e-05', 'epoch': '0.1829', 'num_input_tokens_seen': 14871455, 'train_runtime': '7523', 'train_tokens_per_second': '1977'} +{'loss': '0.7715', 'grad_norm': '1.211', 'learning_rate': '4.996e-05', 'epoch': '0.1829', 'num_input_tokens_seen': 14873502, 'train_runtime': '7524', 'train_tokens_per_second': '1977'} +{'loss': '0.2859', 'grad_norm': '0.8568', 'learning_rate': '4.996e-05', 'epoch': '0.183', 'num_input_tokens_seen': 14875549, 'train_runtime': '7525', 'train_tokens_per_second': '1977'} +{'loss': '0.8148', 'grad_norm': '1.809', 'learning_rate': '4.996e-05', 'epoch': '0.183', 'num_input_tokens_seen': 14877596, 'train_runtime': '7526', 'train_tokens_per_second': '1977'} +{'loss': '0.7279', 'grad_norm': '1.167', 'learning_rate': '4.996e-05', 'epoch': '0.183', 'num_input_tokens_seen': 14879643, 'train_runtime': '7527', 'train_tokens_per_second': '1977'} +{'loss': '1.097', 'grad_norm': '1.87', 'learning_rate': '4.996e-05', 'epoch': '0.183', 'num_input_tokens_seen': 14881690, 'train_runtime': '7528', 'train_tokens_per_second': '1977'} +{'loss': '0.7663', 'grad_norm': '1.406', 'learning_rate': '4.996e-05', 'epoch': '0.1831', 'num_input_tokens_seen': 14883737, 'train_runtime': '7529', 'train_tokens_per_second': '1977'} +{'loss': '0.7666', 'grad_norm': '1.327', 'learning_rate': '4.996e-05', 'epoch': '0.1831', 'num_input_tokens_seen': 14885784, 'train_runtime': '7530', 'train_tokens_per_second': '1977'} +{'loss': '0.8117', 'grad_norm': '1.391', 'learning_rate': '4.996e-05', 'epoch': '0.1831', 'num_input_tokens_seen': 14887831, 'train_runtime': '7531', 'train_tokens_per_second': '1977'} +{'loss': '0.9594', 'grad_norm': '1.889', 'learning_rate': '4.996e-05', 'epoch': '0.1831', 'num_input_tokens_seen': 14889878, 'train_runtime': '7532', 'train_tokens_per_second': '1977'} +{'loss': '0.92', 'grad_norm': '1.173', 'learning_rate': '4.996e-05', 'epoch': '0.1832', 'num_input_tokens_seen': 14891925, 'train_runtime': '7533', 'train_tokens_per_second': '1977'} +{'loss': '0.3041', 'grad_norm': '0.9772', 'learning_rate': '4.996e-05', 'epoch': '0.1832', 'num_input_tokens_seen': 14893972, 'train_runtime': '7534', 'train_tokens_per_second': '1977'} +{'loss': '0.8607', 'grad_norm': '1.727', 'learning_rate': '4.996e-05', 'epoch': '0.1832', 'num_input_tokens_seen': 14896019, 'train_runtime': '7535', 'train_tokens_per_second': '1977'} +{'loss': '0.5372', 'grad_norm': '1.36', 'learning_rate': '4.996e-05', 'epoch': '0.1832', 'num_input_tokens_seen': 14898066, 'train_runtime': '7536', 'train_tokens_per_second': '1977'} +{'loss': '1.451', 'grad_norm': '1.802', 'learning_rate': '4.996e-05', 'epoch': '0.1833', 'num_input_tokens_seen': 14900113, 'train_runtime': '7537', 'train_tokens_per_second': '1977'} +{'loss': '0.6185', 'grad_norm': '1.583', 'learning_rate': '4.996e-05', 'epoch': '0.1833', 'num_input_tokens_seen': 14902160, 'train_runtime': '7538', 'train_tokens_per_second': '1977'} +{'loss': '0.4143', 'grad_norm': '0.9608', 'learning_rate': '4.996e-05', 'epoch': '0.1833', 'num_input_tokens_seen': 14904207, 'train_runtime': '7539', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.722', 'learning_rate': '4.996e-05', 'epoch': '0.1833', 'num_input_tokens_seen': 14906254, 'train_runtime': '7540', 'train_tokens_per_second': '1977'} +{'loss': '0.9883', 'grad_norm': '1.145', 'learning_rate': '4.996e-05', 'epoch': '0.1834', 'num_input_tokens_seen': 14908301, 'train_runtime': '7541', 'train_tokens_per_second': '1977'} +{'loss': '1.548', 'grad_norm': '2.071', 'learning_rate': '4.996e-05', 'epoch': '0.1834', 'num_input_tokens_seen': 14910348, 'train_runtime': '7543', 'train_tokens_per_second': '1977'} +{'loss': '0.628', 'grad_norm': '1.269', 'learning_rate': '4.996e-05', 'epoch': '0.1834', 'num_input_tokens_seen': 14912395, 'train_runtime': '7544', 'train_tokens_per_second': '1977'} +{'loss': '1.217', 'grad_norm': '1.926', 'learning_rate': '4.996e-05', 'epoch': '0.1834', 'num_input_tokens_seen': 14914442, 'train_runtime': '7545', 'train_tokens_per_second': '1977'} +{'loss': '1.214', 'grad_norm': '1.483', 'learning_rate': '4.996e-05', 'epoch': '0.1835', 'num_input_tokens_seen': 14916489, 'train_runtime': '7546', 'train_tokens_per_second': '1977'} +{'loss': '0.9978', 'grad_norm': '1.459', 'learning_rate': '4.996e-05', 'epoch': '0.1835', 'num_input_tokens_seen': 14918536, 'train_runtime': '7547', 'train_tokens_per_second': '1977'} +{'loss': '2.027', 'grad_norm': '2.401', 'learning_rate': '4.996e-05', 'epoch': '0.1835', 'num_input_tokens_seen': 14920583, 'train_runtime': '7548', 'train_tokens_per_second': '1977'} +{'loss': '0.5075', 'grad_norm': '0.921', 'learning_rate': '4.996e-05', 'epoch': '0.1835', 'num_input_tokens_seen': 14922630, 'train_runtime': '7549', 'train_tokens_per_second': '1977'} +{'loss': '0.6714', 'grad_norm': '1.323', 'learning_rate': '4.996e-05', 'epoch': '0.1836', 'num_input_tokens_seen': 14924677, 'train_runtime': '7550', 'train_tokens_per_second': '1977'} +{'loss': '0.4529', 'grad_norm': '1.219', 'learning_rate': '4.996e-05', 'epoch': '0.1836', 'num_input_tokens_seen': 14926724, 'train_runtime': '7551', 'train_tokens_per_second': '1977'} +{'loss': '0.8309', 'grad_norm': '1.789', 'learning_rate': '4.996e-05', 'epoch': '0.1836', 'num_input_tokens_seen': 14928771, 'train_runtime': '7552', 'train_tokens_per_second': '1977'} +{'loss': '0.3116', 'grad_norm': '0.8091', 'learning_rate': '4.996e-05', 'epoch': '0.1836', 'num_input_tokens_seen': 14930818, 'train_runtime': '7553', 'train_tokens_per_second': '1977'} +{'loss': '1.289', 'grad_norm': '1.527', 'learning_rate': '4.996e-05', 'epoch': '0.1837', 'num_input_tokens_seen': 14932865, 'train_runtime': '7554', 'train_tokens_per_second': '1977'} +{'loss': '0.3554', 'grad_norm': '0.9821', 'learning_rate': '4.996e-05', 'epoch': '0.1837', 'num_input_tokens_seen': 14934912, 'train_runtime': '7555', 'train_tokens_per_second': '1977'} +{'loss': '0.3593', 'grad_norm': '0.7772', 'learning_rate': '4.996e-05', 'epoch': '0.1837', 'num_input_tokens_seen': 14936959, 'train_runtime': '7556', 'train_tokens_per_second': '1977'} +{'loss': '0.3118', 'grad_norm': '0.9668', 'learning_rate': '4.996e-05', 'epoch': '0.1838', 'num_input_tokens_seen': 14939006, 'train_runtime': '7557', 'train_tokens_per_second': '1977'} +{'loss': '0.48', 'grad_norm': '1.167', 'learning_rate': '4.996e-05', 'epoch': '0.1838', 'num_input_tokens_seen': 14941053, 'train_runtime': '7558', 'train_tokens_per_second': '1977'} +{'loss': '1.141', 'grad_norm': '1.867', 'learning_rate': '4.996e-05', 'epoch': '0.1838', 'num_input_tokens_seen': 14943100, 'train_runtime': '7559', 'train_tokens_per_second': '1977'} +{'loss': '0.6199', 'grad_norm': '1.659', 'learning_rate': '4.996e-05', 'epoch': '0.1838', 'num_input_tokens_seen': 14945147, 'train_runtime': '7560', 'train_tokens_per_second': '1977'} +{'loss': '0.5825', 'grad_norm': '1.134', 'learning_rate': '4.996e-05', 'epoch': '0.1839', 'num_input_tokens_seen': 14947194, 'train_runtime': '7561', 'train_tokens_per_second': '1977'} +{'loss': '0.7762', 'grad_norm': '1.633', 'learning_rate': '4.996e-05', 'epoch': '0.1839', 'num_input_tokens_seen': 14949241, 'train_runtime': '7562', 'train_tokens_per_second': '1977'} +{'loss': '0.4217', 'grad_norm': '1.009', 'learning_rate': '4.996e-05', 'epoch': '0.1839', 'num_input_tokens_seen': 14951288, 'train_runtime': '7563', 'train_tokens_per_second': '1977'} +{'loss': '0.6705', 'grad_norm': '0.9703', 'learning_rate': '4.996e-05', 'epoch': '0.1839', 'num_input_tokens_seen': 14953335, 'train_runtime': '7564', 'train_tokens_per_second': '1977'} +{'loss': '0.3118', 'grad_norm': '1.097', 'learning_rate': '4.996e-05', 'epoch': '0.184', 'num_input_tokens_seen': 14955382, 'train_runtime': '7565', 'train_tokens_per_second': '1977'} +{'loss': '1.602', 'grad_norm': '2.574', 'learning_rate': '4.996e-05', 'epoch': '0.184', 'num_input_tokens_seen': 14957429, 'train_runtime': '7566', 'train_tokens_per_second': '1977'} +{'loss': '0.6231', 'grad_norm': '1.542', 'learning_rate': '4.996e-05', 'epoch': '0.184', 'num_input_tokens_seen': 14959476, 'train_runtime': '7567', 'train_tokens_per_second': '1977'} +{'loss': '0.01654', 'grad_norm': '0.1696', 'learning_rate': '4.996e-05', 'epoch': '0.184', 'num_input_tokens_seen': 14961523, 'train_runtime': '7568', 'train_tokens_per_second': '1977'} +{'loss': '0.378', 'grad_norm': '0.9175', 'learning_rate': '4.996e-05', 'epoch': '0.1841', 'num_input_tokens_seen': 14963570, 'train_runtime': '7569', 'train_tokens_per_second': '1977'} +{'loss': '1.199', 'grad_norm': '1.802', 'learning_rate': '4.996e-05', 'epoch': '0.1841', 'num_input_tokens_seen': 14965617, 'train_runtime': '7570', 'train_tokens_per_second': '1977'} +{'loss': '0.3455', 'grad_norm': '0.9435', 'learning_rate': '4.996e-05', 'epoch': '0.1841', 'num_input_tokens_seen': 14967664, 'train_runtime': '7571', 'train_tokens_per_second': '1977'} +{'loss': '1.739', 'grad_norm': '1.981', 'learning_rate': '4.996e-05', 'epoch': '0.1841', 'num_input_tokens_seen': 14969711, 'train_runtime': '7572', 'train_tokens_per_second': '1977'} +{'loss': '0.3654', 'grad_norm': '0.9075', 'learning_rate': '4.996e-05', 'epoch': '0.1842', 'num_input_tokens_seen': 14971758, 'train_runtime': '7574', 'train_tokens_per_second': '1977'} +{'loss': '0.9404', 'grad_norm': '1.431', 'learning_rate': '4.996e-05', 'epoch': '0.1842', 'num_input_tokens_seen': 14973805, 'train_runtime': '7575', 'train_tokens_per_second': '1977'} +{'loss': '0.5836', 'grad_norm': '1.193', 'learning_rate': '4.996e-05', 'epoch': '0.1842', 'num_input_tokens_seen': 14975852, 'train_runtime': '7576', 'train_tokens_per_second': '1977'} +{'loss': '1.654', 'grad_norm': '2.002', 'learning_rate': '4.996e-05', 'epoch': '0.1842', 'num_input_tokens_seen': 14977899, 'train_runtime': '7577', 'train_tokens_per_second': '1977'} +{'loss': '1.222', 'grad_norm': '1.322', 'learning_rate': '4.996e-05', 'epoch': '0.1843', 'num_input_tokens_seen': 14979946, 'train_runtime': '7578', 'train_tokens_per_second': '1977'} +{'loss': '0.5486', 'grad_norm': '1.142', 'learning_rate': '4.996e-05', 'epoch': '0.1843', 'num_input_tokens_seen': 14981993, 'train_runtime': '7579', 'train_tokens_per_second': '1977'} +{'loss': '0.4481', 'grad_norm': '0.9299', 'learning_rate': '4.996e-05', 'epoch': '0.1843', 'num_input_tokens_seen': 14984040, 'train_runtime': '7580', 'train_tokens_per_second': '1977'} +{'loss': '0.3533', 'grad_norm': '0.9424', 'learning_rate': '4.996e-05', 'epoch': '0.1843', 'num_input_tokens_seen': 14986087, 'train_runtime': '7581', 'train_tokens_per_second': '1977'} +{'loss': '0.8563', 'grad_norm': '1.117', 'learning_rate': '4.996e-05', 'epoch': '0.1844', 'num_input_tokens_seen': 14988134, 'train_runtime': '7582', 'train_tokens_per_second': '1977'} +{'loss': '1.438', 'grad_norm': '2.09', 'learning_rate': '4.996e-05', 'epoch': '0.1844', 'num_input_tokens_seen': 14990181, 'train_runtime': '7583', 'train_tokens_per_second': '1977'} +{'loss': '0.8811', 'grad_norm': '1.578', 'learning_rate': '4.996e-05', 'epoch': '0.1844', 'num_input_tokens_seen': 14992228, 'train_runtime': '7584', 'train_tokens_per_second': '1977'} +{'loss': '0.9999', 'grad_norm': '1.522', 'learning_rate': '4.996e-05', 'epoch': '0.1844', 'num_input_tokens_seen': 14994275, 'train_runtime': '7585', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.24', 'learning_rate': '4.996e-05', 'epoch': '0.1845', 'num_input_tokens_seen': 14996322, 'train_runtime': '7586', 'train_tokens_per_second': '1977'} +{'loss': '0.4303', 'grad_norm': '0.9518', 'learning_rate': '4.996e-05', 'epoch': '0.1845', 'num_input_tokens_seen': 14998369, 'train_runtime': '7587', 'train_tokens_per_second': '1977'} +{'loss': '0.5008', 'grad_norm': '1.04', 'learning_rate': '4.996e-05', 'epoch': '0.1845', 'num_input_tokens_seen': 15000416, 'train_runtime': '7588', 'train_tokens_per_second': '1977'} +{'loss': '1.916', 'grad_norm': '2.299', 'learning_rate': '4.996e-05', 'epoch': '0.1845', 'num_input_tokens_seen': 15002463, 'train_runtime': '7589', 'train_tokens_per_second': '1977'} +{'loss': '0.226', 'grad_norm': '0.9759', 'learning_rate': '4.996e-05', 'epoch': '0.1846', 'num_input_tokens_seen': 15004510, 'train_runtime': '7590', 'train_tokens_per_second': '1977'} +{'loss': '1.505', 'grad_norm': '1.663', 'learning_rate': '4.996e-05', 'epoch': '0.1846', 'num_input_tokens_seen': 15006557, 'train_runtime': '7591', 'train_tokens_per_second': '1977'} +{'loss': '0.4985', 'grad_norm': '1.022', 'learning_rate': '4.996e-05', 'epoch': '0.1846', 'num_input_tokens_seen': 15008604, 'train_runtime': '7592', 'train_tokens_per_second': '1977'} +{'loss': '0.9907', 'grad_norm': '1.714', 'learning_rate': '4.996e-05', 'epoch': '0.1846', 'num_input_tokens_seen': 15010651, 'train_runtime': '7593', 'train_tokens_per_second': '1977'} +{'loss': '0.2702', 'grad_norm': '0.879', 'learning_rate': '4.996e-05', 'epoch': '0.1847', 'num_input_tokens_seen': 15012698, 'train_runtime': '7594', 'train_tokens_per_second': '1977'} +{'loss': '0.3733', 'grad_norm': '0.9691', 'learning_rate': '4.996e-05', 'epoch': '0.1847', 'num_input_tokens_seen': 15014745, 'train_runtime': '7595', 'train_tokens_per_second': '1977'} +{'loss': '0.9772', 'grad_norm': '1.452', 'learning_rate': '4.996e-05', 'epoch': '0.1847', 'num_input_tokens_seen': 15016792, 'train_runtime': '7596', 'train_tokens_per_second': '1977'} +{'loss': '0.9618', 'grad_norm': '1.215', 'learning_rate': '4.996e-05', 'epoch': '0.1847', 'num_input_tokens_seen': 15018839, 'train_runtime': '7597', 'train_tokens_per_second': '1977'} +{'loss': '0.429', 'grad_norm': '0.9807', 'learning_rate': '4.996e-05', 'epoch': '0.1848', 'num_input_tokens_seen': 15020886, 'train_runtime': '7598', 'train_tokens_per_second': '1977'} +{'loss': '0.9796', 'grad_norm': '1.315', 'learning_rate': '4.996e-05', 'epoch': '0.1848', 'num_input_tokens_seen': 15022933, 'train_runtime': '7599', 'train_tokens_per_second': '1977'} +{'loss': '0.4666', 'grad_norm': '0.9398', 'learning_rate': '4.996e-05', 'epoch': '0.1848', 'num_input_tokens_seen': 15024980, 'train_runtime': '7600', 'train_tokens_per_second': '1977'} +{'loss': '0.3886', 'grad_norm': '0.8018', 'learning_rate': '4.996e-05', 'epoch': '0.1848', 'num_input_tokens_seen': 15027027, 'train_runtime': '7601', 'train_tokens_per_second': '1977'} +{'loss': '0.5171', 'grad_norm': '1.439', 'learning_rate': '4.996e-05', 'epoch': '0.1849', 'num_input_tokens_seen': 15029074, 'train_runtime': '7602', 'train_tokens_per_second': '1977'} +{'loss': '0.5625', 'grad_norm': '1.003', 'learning_rate': '4.996e-05', 'epoch': '0.1849', 'num_input_tokens_seen': 15031121, 'train_runtime': '7603', 'train_tokens_per_second': '1977'} +{'loss': '0.9823', 'grad_norm': '1.587', 'learning_rate': '4.996e-05', 'epoch': '0.1849', 'num_input_tokens_seen': 15033168, 'train_runtime': '7605', 'train_tokens_per_second': '1977'} +{'loss': '0.7758', 'grad_norm': '1.616', 'learning_rate': '4.996e-05', 'epoch': '0.1849', 'num_input_tokens_seen': 15035215, 'train_runtime': '7606', 'train_tokens_per_second': '1977'} +{'loss': '0.9314', 'grad_norm': '1.505', 'learning_rate': '4.996e-05', 'epoch': '0.185', 'num_input_tokens_seen': 15037262, 'train_runtime': '7607', 'train_tokens_per_second': '1977'} +{'loss': '0.4076', 'grad_norm': '1.163', 'learning_rate': '4.996e-05', 'epoch': '0.185', 'num_input_tokens_seen': 15039309, 'train_runtime': '7608', 'train_tokens_per_second': '1977'} +{'loss': '0.3419', 'grad_norm': '0.9632', 'learning_rate': '4.996e-05', 'epoch': '0.185', 'num_input_tokens_seen': 15041356, 'train_runtime': '7609', 'train_tokens_per_second': '1977'} +{'loss': '0.7201', 'grad_norm': '1.032', 'learning_rate': '4.996e-05', 'epoch': '0.185', 'num_input_tokens_seen': 15043403, 'train_runtime': '7610', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '1.914', 'learning_rate': '4.996e-05', 'epoch': '0.1851', 'num_input_tokens_seen': 15045450, 'train_runtime': '7611', 'train_tokens_per_second': '1977'} +{'loss': '0.5446', 'grad_norm': '1.404', 'learning_rate': '4.996e-05', 'epoch': '0.1851', 'num_input_tokens_seen': 15047497, 'train_runtime': '7612', 'train_tokens_per_second': '1977'} +{'loss': '0.7027', 'grad_norm': '1.6', 'learning_rate': '4.996e-05', 'epoch': '0.1851', 'num_input_tokens_seen': 15049544, 'train_runtime': '7613', 'train_tokens_per_second': '1977'} +{'loss': '1.041', 'grad_norm': '2.018', 'learning_rate': '4.996e-05', 'epoch': '0.1851', 'num_input_tokens_seen': 15051591, 'train_runtime': '7614', 'train_tokens_per_second': '1977'} +{'loss': '0.6788', 'grad_norm': '1.625', 'learning_rate': '4.996e-05', 'epoch': '0.1852', 'num_input_tokens_seen': 15053638, 'train_runtime': '7615', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.274', 'learning_rate': '4.996e-05', 'epoch': '0.1852', 'num_input_tokens_seen': 15055685, 'train_runtime': '7616', 'train_tokens_per_second': '1977'} +{'loss': '0.6908', 'grad_norm': '1.346', 'learning_rate': '4.996e-05', 'epoch': '0.1852', 'num_input_tokens_seen': 15057732, 'train_runtime': '7617', 'train_tokens_per_second': '1977'} +{'loss': '0.9257', 'grad_norm': '2.084', 'learning_rate': '4.996e-05', 'epoch': '0.1852', 'num_input_tokens_seen': 15059779, 'train_runtime': '7618', 'train_tokens_per_second': '1977'} +{'loss': '0.8208', 'grad_norm': '1.481', 'learning_rate': '4.996e-05', 'epoch': '0.1853', 'num_input_tokens_seen': 15061826, 'train_runtime': '7619', 'train_tokens_per_second': '1977'} +{'loss': '0.3017', 'grad_norm': '0.876', 'learning_rate': '4.996e-05', 'epoch': '0.1853', 'num_input_tokens_seen': 15063873, 'train_runtime': '7620', 'train_tokens_per_second': '1977'} +{'loss': '0.9068', 'grad_norm': '1.491', 'learning_rate': '4.996e-05', 'epoch': '0.1853', 'num_input_tokens_seen': 15065920, 'train_runtime': '7621', 'train_tokens_per_second': '1977'} +{'loss': '0.9453', 'grad_norm': '1.437', 'learning_rate': '4.996e-05', 'epoch': '0.1853', 'num_input_tokens_seen': 15067967, 'train_runtime': '7622', 'train_tokens_per_second': '1977'} +{'loss': '0.8034', 'grad_norm': '1.568', 'learning_rate': '4.996e-05', 'epoch': '0.1854', 'num_input_tokens_seen': 15070014, 'train_runtime': '7623', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.214', 'learning_rate': '4.996e-05', 'epoch': '0.1854', 'num_input_tokens_seen': 15072061, 'train_runtime': '7624', 'train_tokens_per_second': '1977'} +{'loss': '0.9669', 'grad_norm': '1.726', 'learning_rate': '4.996e-05', 'epoch': '0.1854', 'num_input_tokens_seen': 15074108, 'train_runtime': '7625', 'train_tokens_per_second': '1977'} +{'loss': '0.8195', 'grad_norm': '1.767', 'learning_rate': '4.996e-05', 'epoch': '0.1854', 'num_input_tokens_seen': 15076155, 'train_runtime': '7626', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '1.586', 'learning_rate': '4.996e-05', 'epoch': '0.1855', 'num_input_tokens_seen': 15078202, 'train_runtime': '7627', 'train_tokens_per_second': '1977'} +{'loss': '1.269', 'grad_norm': '2', 'learning_rate': '4.996e-05', 'epoch': '0.1855', 'num_input_tokens_seen': 15080249, 'train_runtime': '7628', 'train_tokens_per_second': '1977'} +{'loss': '0.4374', 'grad_norm': '1.173', 'learning_rate': '4.996e-05', 'epoch': '0.1855', 'num_input_tokens_seen': 15082296, 'train_runtime': '7629', 'train_tokens_per_second': '1977'} +{'loss': '0.803', 'grad_norm': '1.466', 'learning_rate': '4.996e-05', 'epoch': '0.1855', 'num_input_tokens_seen': 15084343, 'train_runtime': '7630', 'train_tokens_per_second': '1977'} +{'loss': '1.172', 'grad_norm': '1.673', 'learning_rate': '4.996e-05', 'epoch': '0.1856', 'num_input_tokens_seen': 15086390, 'train_runtime': '7631', 'train_tokens_per_second': '1977'} +{'loss': '0.3793', 'grad_norm': '0.9412', 'learning_rate': '4.996e-05', 'epoch': '0.1856', 'num_input_tokens_seen': 15088437, 'train_runtime': '7632', 'train_tokens_per_second': '1977'} +{'loss': '1.944', 'grad_norm': '2.786', 'learning_rate': '4.996e-05', 'epoch': '0.1856', 'num_input_tokens_seen': 15090484, 'train_runtime': '7633', 'train_tokens_per_second': '1977'} +{'loss': '0.5313', 'grad_norm': '1.022', 'learning_rate': '4.996e-05', 'epoch': '0.1856', 'num_input_tokens_seen': 15092531, 'train_runtime': '7635', 'train_tokens_per_second': '1977'} +{'loss': '0.2582', 'grad_norm': '0.961', 'learning_rate': '4.996e-05', 'epoch': '0.1857', 'num_input_tokens_seen': 15094578, 'train_runtime': '7636', 'train_tokens_per_second': '1977'} +{'loss': '0.8786', 'grad_norm': '1.463', 'learning_rate': '4.996e-05', 'epoch': '0.1857', 'num_input_tokens_seen': 15096625, 'train_runtime': '7637', 'train_tokens_per_second': '1977'} +{'loss': '1.393', 'grad_norm': '2.112', 'learning_rate': '4.996e-05', 'epoch': '0.1857', 'num_input_tokens_seen': 15098672, 'train_runtime': '7638', 'train_tokens_per_second': '1977'} +{'loss': '0.4303', 'grad_norm': '0.9577', 'learning_rate': '4.996e-05', 'epoch': '0.1857', 'num_input_tokens_seen': 15100719, 'train_runtime': '7639', 'train_tokens_per_second': '1977'} +{'loss': '0.2955', 'grad_norm': '1.013', 'learning_rate': '4.996e-05', 'epoch': '0.1858', 'num_input_tokens_seen': 15102766, 'train_runtime': '7640', 'train_tokens_per_second': '1977'} +{'loss': '1.655', 'grad_norm': '2.387', 'learning_rate': '4.996e-05', 'epoch': '0.1858', 'num_input_tokens_seen': 15104813, 'train_runtime': '7641', 'train_tokens_per_second': '1977'} +{'loss': '0.3235', 'grad_norm': '1.145', 'learning_rate': '4.996e-05', 'epoch': '0.1858', 'num_input_tokens_seen': 15106860, 'train_runtime': '7642', 'train_tokens_per_second': '1977'} +{'loss': '0.7839', 'grad_norm': '1.12', 'learning_rate': '4.996e-05', 'epoch': '0.1858', 'num_input_tokens_seen': 15108907, 'train_runtime': '7643', 'train_tokens_per_second': '1977'} +{'loss': '0.3335', 'grad_norm': '0.8519', 'learning_rate': '4.996e-05', 'epoch': '0.1859', 'num_input_tokens_seen': 15110954, 'train_runtime': '7644', 'train_tokens_per_second': '1977'} +{'loss': '0.9232', 'grad_norm': '1.516', 'learning_rate': '4.996e-05', 'epoch': '0.1859', 'num_input_tokens_seen': 15113001, 'train_runtime': '7645', 'train_tokens_per_second': '1977'} +{'loss': '0.6435', 'grad_norm': '1.584', 'learning_rate': '4.996e-05', 'epoch': '0.1859', 'num_input_tokens_seen': 15115048, 'train_runtime': '7646', 'train_tokens_per_second': '1977'} +{'loss': '0.8993', 'grad_norm': '1.441', 'learning_rate': '4.996e-05', 'epoch': '0.1859', 'num_input_tokens_seen': 15117095, 'train_runtime': '7647', 'train_tokens_per_second': '1977'} +{'loss': '0.7793', 'grad_norm': '1.254', 'learning_rate': '4.996e-05', 'epoch': '0.186', 'num_input_tokens_seen': 15119142, 'train_runtime': '7648', 'train_tokens_per_second': '1977'} +{'loss': '0.3792', 'grad_norm': '1.136', 'learning_rate': '4.996e-05', 'epoch': '0.186', 'num_input_tokens_seen': 15121189, 'train_runtime': '7649', 'train_tokens_per_second': '1977'} +{'loss': '0.8612', 'grad_norm': '1.616', 'learning_rate': '4.996e-05', 'epoch': '0.186', 'num_input_tokens_seen': 15123236, 'train_runtime': '7650', 'train_tokens_per_second': '1977'} +{'loss': '0.7164', 'grad_norm': '1.378', 'learning_rate': '4.996e-05', 'epoch': '0.186', 'num_input_tokens_seen': 15125283, 'train_runtime': '7651', 'train_tokens_per_second': '1977'} +{'loss': '0.777', 'grad_norm': '1.562', 'learning_rate': '4.996e-05', 'epoch': '0.1861', 'num_input_tokens_seen': 15127330, 'train_runtime': '7652', 'train_tokens_per_second': '1977'} +{'loss': '0.7724', 'grad_norm': '1.372', 'learning_rate': '4.996e-05', 'epoch': '0.1861', 'num_input_tokens_seen': 15129377, 'train_runtime': '7653', 'train_tokens_per_second': '1977'} +{'loss': '0.3846', 'grad_norm': '0.9695', 'learning_rate': '4.996e-05', 'epoch': '0.1861', 'num_input_tokens_seen': 15131424, 'train_runtime': '7654', 'train_tokens_per_second': '1977'} +{'loss': '0.2326', 'grad_norm': '0.8166', 'learning_rate': '4.996e-05', 'epoch': '0.1861', 'num_input_tokens_seen': 15133471, 'train_runtime': '7655', 'train_tokens_per_second': '1977'} +{'loss': '0.6705', 'grad_norm': '1.587', 'learning_rate': '4.996e-05', 'epoch': '0.1862', 'num_input_tokens_seen': 15135518, 'train_runtime': '7656', 'train_tokens_per_second': '1977'} +{'loss': '0.44', 'grad_norm': '0.88', 'learning_rate': '4.996e-05', 'epoch': '0.1862', 'num_input_tokens_seen': 15137565, 'train_runtime': '7657', 'train_tokens_per_second': '1977'} +{'loss': '0.412', 'grad_norm': '0.8063', 'learning_rate': '4.996e-05', 'epoch': '0.1862', 'num_input_tokens_seen': 15139612, 'train_runtime': '7658', 'train_tokens_per_second': '1977'} +{'loss': '1.863', 'grad_norm': '2.598', 'learning_rate': '4.996e-05', 'epoch': '0.1862', 'num_input_tokens_seen': 15141659, 'train_runtime': '7659', 'train_tokens_per_second': '1977'} +{'loss': '2.465', 'grad_norm': '2.22', 'learning_rate': '4.996e-05', 'epoch': '0.1863', 'num_input_tokens_seen': 15143706, 'train_runtime': '7660', 'train_tokens_per_second': '1977'} +{'loss': '0.6346', 'grad_norm': '1.472', 'learning_rate': '4.996e-05', 'epoch': '0.1863', 'num_input_tokens_seen': 15145753, 'train_runtime': '7661', 'train_tokens_per_second': '1977'} +{'loss': '0.5791', 'grad_norm': '1.018', 'learning_rate': '4.996e-05', 'epoch': '0.1863', 'num_input_tokens_seen': 15147800, 'train_runtime': '7662', 'train_tokens_per_second': '1977'} +{'loss': '0.9999', 'grad_norm': '1.421', 'learning_rate': '4.996e-05', 'epoch': '0.1863', 'num_input_tokens_seen': 15149847, 'train_runtime': '7663', 'train_tokens_per_second': '1977'} +{'loss': '1.329', 'grad_norm': '2.386', 'learning_rate': '4.996e-05', 'epoch': '0.1864', 'num_input_tokens_seen': 15151894, 'train_runtime': '7664', 'train_tokens_per_second': '1977'} +{'loss': '2.293', 'grad_norm': '2.587', 'learning_rate': '4.996e-05', 'epoch': '0.1864', 'num_input_tokens_seen': 15153941, 'train_runtime': '7666', 'train_tokens_per_second': '1977'} +{'loss': '0.2786', 'grad_norm': '0.8322', 'learning_rate': '4.996e-05', 'epoch': '0.1864', 'num_input_tokens_seen': 15155988, 'train_runtime': '7667', 'train_tokens_per_second': '1977'} +{'loss': '0.4122', 'grad_norm': '1.015', 'learning_rate': '4.996e-05', 'epoch': '0.1864', 'num_input_tokens_seen': 15158035, 'train_runtime': '7668', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '1.628', 'learning_rate': '4.996e-05', 'epoch': '0.1865', 'num_input_tokens_seen': 15160082, 'train_runtime': '7669', 'train_tokens_per_second': '1977'} +{'loss': '1.737', 'grad_norm': '2.244', 'learning_rate': '4.996e-05', 'epoch': '0.1865', 'num_input_tokens_seen': 15162129, 'train_runtime': '7670', 'train_tokens_per_second': '1977'} +{'loss': '1.494', 'grad_norm': '1.932', 'learning_rate': '4.996e-05', 'epoch': '0.1865', 'num_input_tokens_seen': 15164176, 'train_runtime': '7671', 'train_tokens_per_second': '1977'} +{'loss': '0.2838', 'grad_norm': '0.8065', 'learning_rate': '4.996e-05', 'epoch': '0.1865', 'num_input_tokens_seen': 15166223, 'train_runtime': '7672', 'train_tokens_per_second': '1977'} +{'loss': '0.3598', 'grad_norm': '1.002', 'learning_rate': '4.996e-05', 'epoch': '0.1866', 'num_input_tokens_seen': 15168270, 'train_runtime': '7673', 'train_tokens_per_second': '1977'} +{'loss': '1.042', 'grad_norm': '1.126', 'learning_rate': '4.996e-05', 'epoch': '0.1866', 'num_input_tokens_seen': 15170317, 'train_runtime': '7674', 'train_tokens_per_second': '1977'} +{'loss': '1.24', 'grad_norm': '1.415', 'learning_rate': '4.996e-05', 'epoch': '0.1866', 'num_input_tokens_seen': 15172364, 'train_runtime': '7675', 'train_tokens_per_second': '1977'} +{'loss': '0.4411', 'grad_norm': '1.165', 'learning_rate': '4.996e-05', 'epoch': '0.1866', 'num_input_tokens_seen': 15174411, 'train_runtime': '7676', 'train_tokens_per_second': '1977'} +{'loss': '0.731', 'grad_norm': '1.316', 'learning_rate': '4.996e-05', 'epoch': '0.1867', 'num_input_tokens_seen': 15176458, 'train_runtime': '7677', 'train_tokens_per_second': '1977'} +{'loss': '2.043', 'grad_norm': '2.643', 'learning_rate': '4.996e-05', 'epoch': '0.1867', 'num_input_tokens_seen': 15178505, 'train_runtime': '7678', 'train_tokens_per_second': '1977'} +{'loss': '1.147', 'grad_norm': '2.125', 'learning_rate': '4.996e-05', 'epoch': '0.1867', 'num_input_tokens_seen': 15180552, 'train_runtime': '7679', 'train_tokens_per_second': '1977'} +{'loss': '0.7769', 'grad_norm': '1.347', 'learning_rate': '4.996e-05', 'epoch': '0.1867', 'num_input_tokens_seen': 15182599, 'train_runtime': '7680', 'train_tokens_per_second': '1977'} +{'loss': '0.7461', 'grad_norm': '1.165', 'learning_rate': '4.996e-05', 'epoch': '0.1868', 'num_input_tokens_seen': 15184646, 'train_runtime': '7681', 'train_tokens_per_second': '1977'} +{'loss': '0.5608', 'grad_norm': '1.103', 'learning_rate': '4.996e-05', 'epoch': '0.1868', 'num_input_tokens_seen': 15186693, 'train_runtime': '7682', 'train_tokens_per_second': '1977'} +{'loss': '0.3805', 'grad_norm': '0.8135', 'learning_rate': '4.996e-05', 'epoch': '0.1868', 'num_input_tokens_seen': 15188740, 'train_runtime': '7683', 'train_tokens_per_second': '1977'} +{'loss': '0.294', 'grad_norm': '1.101', 'learning_rate': '4.996e-05', 'epoch': '0.1868', 'num_input_tokens_seen': 15190787, 'train_runtime': '7684', 'train_tokens_per_second': '1977'} +{'loss': '0.1931', 'grad_norm': '0.8302', 'learning_rate': '4.996e-05', 'epoch': '0.1869', 'num_input_tokens_seen': 15192834, 'train_runtime': '7685', 'train_tokens_per_second': '1977'} +{'loss': '0.997', 'grad_norm': '1.707', 'learning_rate': '4.996e-05', 'epoch': '0.1869', 'num_input_tokens_seen': 15194881, 'train_runtime': '7686', 'train_tokens_per_second': '1977'} +{'loss': '0.898', 'grad_norm': '1.355', 'learning_rate': '4.996e-05', 'epoch': '0.1869', 'num_input_tokens_seen': 15196928, 'train_runtime': '7687', 'train_tokens_per_second': '1977'} +{'loss': '1.342', 'grad_norm': '2.254', 'learning_rate': '4.996e-05', 'epoch': '0.1869', 'num_input_tokens_seen': 15198975, 'train_runtime': '7688', 'train_tokens_per_second': '1977'} +{'loss': '0.569', 'grad_norm': '1.34', 'learning_rate': '4.996e-05', 'epoch': '0.187', 'num_input_tokens_seen': 15201022, 'train_runtime': '7689', 'train_tokens_per_second': '1977'} +{'loss': '0.9452', 'grad_norm': '1.545', 'learning_rate': '4.996e-05', 'epoch': '0.187', 'num_input_tokens_seen': 15203069, 'train_runtime': '7690', 'train_tokens_per_second': '1977'} +{'loss': '0.5959', 'grad_norm': '1.429', 'learning_rate': '4.996e-05', 'epoch': '0.187', 'num_input_tokens_seen': 15205116, 'train_runtime': '7691', 'train_tokens_per_second': '1977'} +{'loss': '0.2675', 'grad_norm': '0.9901', 'learning_rate': '4.996e-05', 'epoch': '0.187', 'num_input_tokens_seen': 15207163, 'train_runtime': '7692', 'train_tokens_per_second': '1977'} +{'loss': '0.3808', 'grad_norm': '1.056', 'learning_rate': '4.996e-05', 'epoch': '0.1871', 'num_input_tokens_seen': 15209210, 'train_runtime': '7693', 'train_tokens_per_second': '1977'} +{'loss': '0.538', 'grad_norm': '1.23', 'learning_rate': '4.996e-05', 'epoch': '0.1871', 'num_input_tokens_seen': 15211257, 'train_runtime': '7694', 'train_tokens_per_second': '1977'} +{'loss': '0.7633', 'grad_norm': '1.56', 'learning_rate': '4.996e-05', 'epoch': '0.1871', 'num_input_tokens_seen': 15213304, 'train_runtime': '7695', 'train_tokens_per_second': '1977'} +{'loss': '0.4408', 'grad_norm': '1.359', 'learning_rate': '4.996e-05', 'epoch': '0.1871', 'num_input_tokens_seen': 15215351, 'train_runtime': '7697', 'train_tokens_per_second': '1977'} +{'loss': '1.024', 'grad_norm': '1.56', 'learning_rate': '4.996e-05', 'epoch': '0.1872', 'num_input_tokens_seen': 15217398, 'train_runtime': '7698', 'train_tokens_per_second': '1977'} +{'loss': '1.448', 'grad_norm': '2.266', 'learning_rate': '4.996e-05', 'epoch': '0.1872', 'num_input_tokens_seen': 15219445, 'train_runtime': '7699', 'train_tokens_per_second': '1977'} +{'loss': '0.9699', 'grad_norm': '1.933', 'learning_rate': '4.996e-05', 'epoch': '0.1872', 'num_input_tokens_seen': 15221492, 'train_runtime': '7700', 'train_tokens_per_second': '1977'} +{'loss': '0.9002', 'grad_norm': '1.187', 'learning_rate': '4.996e-05', 'epoch': '0.1872', 'num_input_tokens_seen': 15223539, 'train_runtime': '7701', 'train_tokens_per_second': '1977'} +{'loss': '0.9716', 'grad_norm': '1.522', 'learning_rate': '4.996e-05', 'epoch': '0.1873', 'num_input_tokens_seen': 15225586, 'train_runtime': '7702', 'train_tokens_per_second': '1977'} +{'loss': '0.7826', 'grad_norm': '1.118', 'learning_rate': '4.996e-05', 'epoch': '0.1873', 'num_input_tokens_seen': 15227633, 'train_runtime': '7703', 'train_tokens_per_second': '1977'} +{'loss': '0.7405', 'grad_norm': '1.034', 'learning_rate': '4.996e-05', 'epoch': '0.1873', 'num_input_tokens_seen': 15229680, 'train_runtime': '7704', 'train_tokens_per_second': '1977'} +{'loss': '1.48', 'grad_norm': '2.216', 'learning_rate': '4.996e-05', 'epoch': '0.1874', 'num_input_tokens_seen': 15231727, 'train_runtime': '7705', 'train_tokens_per_second': '1977'} +{'loss': '0.4897', 'grad_norm': '0.9997', 'learning_rate': '4.996e-05', 'epoch': '0.1874', 'num_input_tokens_seen': 15233774, 'train_runtime': '7706', 'train_tokens_per_second': '1977'} +{'loss': '1.608', 'grad_norm': '2.123', 'learning_rate': '4.996e-05', 'epoch': '0.1874', 'num_input_tokens_seen': 15235821, 'train_runtime': '7707', 'train_tokens_per_second': '1977'} +{'loss': '1.257', 'grad_norm': '1.751', 'learning_rate': '4.996e-05', 'epoch': '0.1874', 'num_input_tokens_seen': 15237868, 'train_runtime': '7708', 'train_tokens_per_second': '1977'} +{'loss': '0.3796', 'grad_norm': '0.9582', 'learning_rate': '4.996e-05', 'epoch': '0.1875', 'num_input_tokens_seen': 15239915, 'train_runtime': '7709', 'train_tokens_per_second': '1977'} +{'loss': '2.511', 'grad_norm': '4.509', 'learning_rate': '4.996e-05', 'epoch': '0.1875', 'num_input_tokens_seen': 15241962, 'train_runtime': '7710', 'train_tokens_per_second': '1977'} +{'loss': '0.4978', 'grad_norm': '1.12', 'learning_rate': '4.996e-05', 'epoch': '0.1875', 'num_input_tokens_seen': 15244009, 'train_runtime': '7711', 'train_tokens_per_second': '1977'} +{'loss': '0.3938', 'grad_norm': '1.181', 'learning_rate': '4.996e-05', 'epoch': '0.1875', 'num_input_tokens_seen': 15246056, 'train_runtime': '7712', 'train_tokens_per_second': '1977'} +{'loss': '1.223', 'grad_norm': '1.618', 'learning_rate': '4.996e-05', 'epoch': '0.1876', 'num_input_tokens_seen': 15248103, 'train_runtime': '7713', 'train_tokens_per_second': '1977'} +{'loss': '0.7483', 'grad_norm': '1.179', 'learning_rate': '4.996e-05', 'epoch': '0.1876', 'num_input_tokens_seen': 15250150, 'train_runtime': '7714', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '1.543', 'learning_rate': '4.996e-05', 'epoch': '0.1876', 'num_input_tokens_seen': 15252197, 'train_runtime': '7715', 'train_tokens_per_second': '1977'} +{'loss': '0.7949', 'grad_norm': '1.313', 'learning_rate': '4.996e-05', 'epoch': '0.1876', 'num_input_tokens_seen': 15254244, 'train_runtime': '7716', 'train_tokens_per_second': '1977'} +{'loss': '0.8759', 'grad_norm': '1.171', 'learning_rate': '4.996e-05', 'epoch': '0.1877', 'num_input_tokens_seen': 15256291, 'train_runtime': '7717', 'train_tokens_per_second': '1977'} +{'loss': '0.6275', 'grad_norm': '1.09', 'learning_rate': '4.996e-05', 'epoch': '0.1877', 'num_input_tokens_seen': 15258338, 'train_runtime': '7718', 'train_tokens_per_second': '1977'} +{'loss': '1.498', 'grad_norm': '2.023', 'learning_rate': '4.996e-05', 'epoch': '0.1877', 'num_input_tokens_seen': 15260385, 'train_runtime': '7719', 'train_tokens_per_second': '1977'} +{'loss': '0.349', 'grad_norm': '1.038', 'learning_rate': '4.996e-05', 'epoch': '0.1877', 'num_input_tokens_seen': 15262432, 'train_runtime': '7720', 'train_tokens_per_second': '1977'} +{'loss': '0.8019', 'grad_norm': '1.482', 'learning_rate': '4.996e-05', 'epoch': '0.1878', 'num_input_tokens_seen': 15264479, 'train_runtime': '7721', 'train_tokens_per_second': '1977'} +{'loss': '0.6588', 'grad_norm': '1.311', 'learning_rate': '4.996e-05', 'epoch': '0.1878', 'num_input_tokens_seen': 15266526, 'train_runtime': '7722', 'train_tokens_per_second': '1977'} +{'loss': '0.3627', 'grad_norm': '0.9274', 'learning_rate': '4.996e-05', 'epoch': '0.1878', 'num_input_tokens_seen': 15268573, 'train_runtime': '7723', 'train_tokens_per_second': '1977'} +{'loss': '0.7285', 'grad_norm': '1.735', 'learning_rate': '4.996e-05', 'epoch': '0.1878', 'num_input_tokens_seen': 15270620, 'train_runtime': '7724', 'train_tokens_per_second': '1977'} +{'loss': '0.4748', 'grad_norm': '0.951', 'learning_rate': '4.996e-05', 'epoch': '0.1879', 'num_input_tokens_seen': 15272667, 'train_runtime': '7725', 'train_tokens_per_second': '1977'} +{'loss': '0.5176', 'grad_norm': '1.108', 'learning_rate': '4.996e-05', 'epoch': '0.1879', 'num_input_tokens_seen': 15274714, 'train_runtime': '7726', 'train_tokens_per_second': '1977'} +{'loss': '0.6196', 'grad_norm': '1.31', 'learning_rate': '4.996e-05', 'epoch': '0.1879', 'num_input_tokens_seen': 15276761, 'train_runtime': '7728', 'train_tokens_per_second': '1977'} +{'loss': '0.2695', 'grad_norm': '0.9416', 'learning_rate': '4.996e-05', 'epoch': '0.1879', 'num_input_tokens_seen': 15278808, 'train_runtime': '7729', 'train_tokens_per_second': '1977'} +{'loss': '2.267', 'grad_norm': '2.193', 'learning_rate': '4.996e-05', 'epoch': '0.188', 'num_input_tokens_seen': 15280855, 'train_runtime': '7730', 'train_tokens_per_second': '1977'} +{'loss': '0.8473', 'grad_norm': '1.473', 'learning_rate': '4.996e-05', 'epoch': '0.188', 'num_input_tokens_seen': 15282902, 'train_runtime': '7731', 'train_tokens_per_second': '1977'} +{'loss': '0.7813', 'grad_norm': '1.297', 'learning_rate': '4.996e-05', 'epoch': '0.188', 'num_input_tokens_seen': 15284949, 'train_runtime': '7732', 'train_tokens_per_second': '1977'} +{'loss': '0.7099', 'grad_norm': '1.432', 'learning_rate': '4.996e-05', 'epoch': '0.188', 'num_input_tokens_seen': 15286996, 'train_runtime': '7733', 'train_tokens_per_second': '1977'} +{'loss': '1.497', 'grad_norm': '2.032', 'learning_rate': '4.996e-05', 'epoch': '0.1881', 'num_input_tokens_seen': 15289043, 'train_runtime': '7734', 'train_tokens_per_second': '1977'} +{'loss': '0.3184', 'grad_norm': '0.9883', 'learning_rate': '4.996e-05', 'epoch': '0.1881', 'num_input_tokens_seen': 15291090, 'train_runtime': '7735', 'train_tokens_per_second': '1977'} +{'loss': '0.6258', 'grad_norm': '1.482', 'learning_rate': '4.996e-05', 'epoch': '0.1881', 'num_input_tokens_seen': 15293137, 'train_runtime': '7736', 'train_tokens_per_second': '1977'} +{'loss': '0.6858', 'grad_norm': '1.405', 'learning_rate': '4.996e-05', 'epoch': '0.1881', 'num_input_tokens_seen': 15295184, 'train_runtime': '7737', 'train_tokens_per_second': '1977'} +{'loss': '0.3634', 'grad_norm': '0.9363', 'learning_rate': '4.996e-05', 'epoch': '0.1882', 'num_input_tokens_seen': 15297231, 'train_runtime': '7738', 'train_tokens_per_second': '1977'} +{'loss': '0.339', 'grad_norm': '0.9365', 'learning_rate': '4.996e-05', 'epoch': '0.1882', 'num_input_tokens_seen': 15299278, 'train_runtime': '7739', 'train_tokens_per_second': '1977'} +{'loss': '0.6767', 'grad_norm': '1.289', 'learning_rate': '4.996e-05', 'epoch': '0.1882', 'num_input_tokens_seen': 15301325, 'train_runtime': '7740', 'train_tokens_per_second': '1977'} +{'loss': '0.434', 'grad_norm': '1.311', 'learning_rate': '4.996e-05', 'epoch': '0.1882', 'num_input_tokens_seen': 15303372, 'train_runtime': '7741', 'train_tokens_per_second': '1977'} +{'loss': '1.359', 'grad_norm': '2.037', 'learning_rate': '4.996e-05', 'epoch': '0.1883', 'num_input_tokens_seen': 15305419, 'train_runtime': '7742', 'train_tokens_per_second': '1977'} +{'loss': '0.5917', 'grad_norm': '1.33', 'learning_rate': '4.996e-05', 'epoch': '0.1883', 'num_input_tokens_seen': 15307466, 'train_runtime': '7743', 'train_tokens_per_second': '1977'} +{'loss': '0.6896', 'grad_norm': '1.504', 'learning_rate': '4.996e-05', 'epoch': '0.1883', 'num_input_tokens_seen': 15309513, 'train_runtime': '7744', 'train_tokens_per_second': '1977'} +{'loss': '0.3209', 'grad_norm': '1.284', 'learning_rate': '4.996e-05', 'epoch': '0.1883', 'num_input_tokens_seen': 15311560, 'train_runtime': '7745', 'train_tokens_per_second': '1977'} +{'loss': '0.3512', 'grad_norm': '0.7883', 'learning_rate': '4.996e-05', 'epoch': '0.1884', 'num_input_tokens_seen': 15313607, 'train_runtime': '7746', 'train_tokens_per_second': '1977'} +{'loss': '0.495', 'grad_norm': '1.286', 'learning_rate': '4.996e-05', 'epoch': '0.1884', 'num_input_tokens_seen': 15315654, 'train_runtime': '7747', 'train_tokens_per_second': '1977'} +{'loss': '0.4177', 'grad_norm': '1.097', 'learning_rate': '4.996e-05', 'epoch': '0.1884', 'num_input_tokens_seen': 15317701, 'train_runtime': '7748', 'train_tokens_per_second': '1977'} +{'loss': '0.2709', 'grad_norm': '0.9313', 'learning_rate': '4.996e-05', 'epoch': '0.1884', 'num_input_tokens_seen': 15319748, 'train_runtime': '7749', 'train_tokens_per_second': '1977'} +{'loss': '1.939', 'grad_norm': '2.971', 'learning_rate': '4.996e-05', 'epoch': '0.1885', 'num_input_tokens_seen': 15321795, 'train_runtime': '7750', 'train_tokens_per_second': '1977'} +{'loss': '0.4056', 'grad_norm': '1.008', 'learning_rate': '4.996e-05', 'epoch': '0.1885', 'num_input_tokens_seen': 15323842, 'train_runtime': '7751', 'train_tokens_per_second': '1977'} +{'loss': '0.7562', 'grad_norm': '1.545', 'learning_rate': '4.996e-05', 'epoch': '0.1885', 'num_input_tokens_seen': 15325889, 'train_runtime': '7752', 'train_tokens_per_second': '1977'} +{'loss': '1.325', 'grad_norm': '2.384', 'learning_rate': '4.996e-05', 'epoch': '0.1885', 'num_input_tokens_seen': 15327936, 'train_runtime': '7753', 'train_tokens_per_second': '1977'} +{'loss': '0.7286', 'grad_norm': '1.309', 'learning_rate': '4.996e-05', 'epoch': '0.1886', 'num_input_tokens_seen': 15329983, 'train_runtime': '7754', 'train_tokens_per_second': '1977'} +{'loss': '1.334', 'grad_norm': '2.092', 'learning_rate': '4.996e-05', 'epoch': '0.1886', 'num_input_tokens_seen': 15332030, 'train_runtime': '7755', 'train_tokens_per_second': '1977'} +{'loss': '1.304', 'grad_norm': '1.81', 'learning_rate': '4.996e-05', 'epoch': '0.1886', 'num_input_tokens_seen': 15334077, 'train_runtime': '7756', 'train_tokens_per_second': '1977'} +{'loss': '1.696', 'grad_norm': '2.763', 'learning_rate': '4.996e-05', 'epoch': '0.1886', 'num_input_tokens_seen': 15336124, 'train_runtime': '7757', 'train_tokens_per_second': '1977'} +{'loss': '1.596', 'grad_norm': '2.146', 'learning_rate': '4.996e-05', 'epoch': '0.1887', 'num_input_tokens_seen': 15338171, 'train_runtime': '7759', 'train_tokens_per_second': '1977'} +{'loss': '0.8805', 'grad_norm': '1.615', 'learning_rate': '4.996e-05', 'epoch': '0.1887', 'num_input_tokens_seen': 15340218, 'train_runtime': '7760', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '2.006', 'learning_rate': '4.996e-05', 'epoch': '0.1887', 'num_input_tokens_seen': 15342265, 'train_runtime': '7761', 'train_tokens_per_second': '1977'} +{'loss': '0.4866', 'grad_norm': '1.327', 'learning_rate': '4.996e-05', 'epoch': '0.1887', 'num_input_tokens_seen': 15344312, 'train_runtime': '7762', 'train_tokens_per_second': '1977'} +{'loss': '0.5397', 'grad_norm': '1.059', 'learning_rate': '4.996e-05', 'epoch': '0.1888', 'num_input_tokens_seen': 15346359, 'train_runtime': '7763', 'train_tokens_per_second': '1977'} +{'loss': '0.8273', 'grad_norm': '1.277', 'learning_rate': '4.996e-05', 'epoch': '0.1888', 'num_input_tokens_seen': 15348406, 'train_runtime': '7764', 'train_tokens_per_second': '1977'} +{'loss': '0.389', 'grad_norm': '1.402', 'learning_rate': '4.996e-05', 'epoch': '0.1888', 'num_input_tokens_seen': 15350453, 'train_runtime': '7765', 'train_tokens_per_second': '1977'} +{'loss': '1.225', 'grad_norm': '1.782', 'learning_rate': '4.996e-05', 'epoch': '0.1888', 'num_input_tokens_seen': 15352500, 'train_runtime': '7766', 'train_tokens_per_second': '1977'} +{'loss': '0.6222', 'grad_norm': '1.543', 'learning_rate': '4.996e-05', 'epoch': '0.1889', 'num_input_tokens_seen': 15354547, 'train_runtime': '7767', 'train_tokens_per_second': '1977'} +{'loss': '0.5366', 'grad_norm': '1.132', 'learning_rate': '4.996e-05', 'epoch': '0.1889', 'num_input_tokens_seen': 15356594, 'train_runtime': '7768', 'train_tokens_per_second': '1977'} +{'loss': '0.8708', 'grad_norm': '1.491', 'learning_rate': '4.996e-05', 'epoch': '0.1889', 'num_input_tokens_seen': 15358641, 'train_runtime': '7769', 'train_tokens_per_second': '1977'} +{'loss': '1.086', 'grad_norm': '1.9', 'learning_rate': '4.996e-05', 'epoch': '0.1889', 'num_input_tokens_seen': 15360688, 'train_runtime': '7770', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '1.166', 'learning_rate': '4.996e-05', 'epoch': '0.189', 'num_input_tokens_seen': 15362735, 'train_runtime': '7771', 'train_tokens_per_second': '1977'} +{'loss': '0.2875', 'grad_norm': '0.9252', 'learning_rate': '4.996e-05', 'epoch': '0.189', 'num_input_tokens_seen': 15364782, 'train_runtime': '7772', 'train_tokens_per_second': '1977'} +{'loss': '0.5125', 'grad_norm': '1.228', 'learning_rate': '4.996e-05', 'epoch': '0.189', 'num_input_tokens_seen': 15366829, 'train_runtime': '7773', 'train_tokens_per_second': '1977'} +{'loss': '1.875', 'grad_norm': '2.882', 'learning_rate': '4.996e-05', 'epoch': '0.189', 'num_input_tokens_seen': 15368876, 'train_runtime': '7774', 'train_tokens_per_second': '1977'} +{'loss': '0.8511', 'grad_norm': '1.644', 'learning_rate': '4.996e-05', 'epoch': '0.1891', 'num_input_tokens_seen': 15370923, 'train_runtime': '7775', 'train_tokens_per_second': '1977'} +{'loss': '1.745', 'grad_norm': '3.279', 'learning_rate': '4.996e-05', 'epoch': '0.1891', 'num_input_tokens_seen': 15372970, 'train_runtime': '7776', 'train_tokens_per_second': '1977'} +{'loss': '1.5', 'grad_norm': '2.567', 'learning_rate': '4.996e-05', 'epoch': '0.1891', 'num_input_tokens_seen': 15375017, 'train_runtime': '7777', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '1.806', 'learning_rate': '4.996e-05', 'epoch': '0.1891', 'num_input_tokens_seen': 15377064, 'train_runtime': '7778', 'train_tokens_per_second': '1977'} +{'loss': '0.7386', 'grad_norm': '1.404', 'learning_rate': '4.996e-05', 'epoch': '0.1892', 'num_input_tokens_seen': 15379111, 'train_runtime': '7779', 'train_tokens_per_second': '1977'} +{'loss': '0.3002', 'grad_norm': '0.9695', 'learning_rate': '4.996e-05', 'epoch': '0.1892', 'num_input_tokens_seen': 15381158, 'train_runtime': '7780', 'train_tokens_per_second': '1977'} +{'loss': '0.9693', 'grad_norm': '1.464', 'learning_rate': '4.996e-05', 'epoch': '0.1892', 'num_input_tokens_seen': 15383205, 'train_runtime': '7781', 'train_tokens_per_second': '1977'} +{'loss': '0.8528', 'grad_norm': '1.863', 'learning_rate': '4.996e-05', 'epoch': '0.1892', 'num_input_tokens_seen': 15385252, 'train_runtime': '7782', 'train_tokens_per_second': '1977'} +{'loss': '0.3463', 'grad_norm': '1.026', 'learning_rate': '4.996e-05', 'epoch': '0.1893', 'num_input_tokens_seen': 15387299, 'train_runtime': '7783', 'train_tokens_per_second': '1977'} +{'loss': '1.335', 'grad_norm': '2.4', 'learning_rate': '4.996e-05', 'epoch': '0.1893', 'num_input_tokens_seen': 15389346, 'train_runtime': '7784', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.536', 'learning_rate': '4.996e-05', 'epoch': '0.1893', 'num_input_tokens_seen': 15391393, 'train_runtime': '7785', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.477', 'learning_rate': '4.996e-05', 'epoch': '0.1893', 'num_input_tokens_seen': 15393440, 'train_runtime': '7786', 'train_tokens_per_second': '1977'} +{'loss': '0.5032', 'grad_norm': '1.325', 'learning_rate': '4.996e-05', 'epoch': '0.1894', 'num_input_tokens_seen': 15395487, 'train_runtime': '7787', 'train_tokens_per_second': '1977'} +{'loss': '0.4206', 'grad_norm': '0.9894', 'learning_rate': '4.996e-05', 'epoch': '0.1894', 'num_input_tokens_seen': 15397534, 'train_runtime': '7788', 'train_tokens_per_second': '1977'} +{'loss': '0.5453', 'grad_norm': '1.182', 'learning_rate': '4.996e-05', 'epoch': '0.1894', 'num_input_tokens_seen': 15399581, 'train_runtime': '7789', 'train_tokens_per_second': '1977'} +{'loss': '0.5798', 'grad_norm': '1.277', 'learning_rate': '4.996e-05', 'epoch': '0.1894', 'num_input_tokens_seen': 15401628, 'train_runtime': '7791', 'train_tokens_per_second': '1977'} +{'loss': '0.4971', 'grad_norm': '1.022', 'learning_rate': '4.996e-05', 'epoch': '0.1895', 'num_input_tokens_seen': 15403675, 'train_runtime': '7792', 'train_tokens_per_second': '1977'} +{'loss': '0.5497', 'grad_norm': '1.147', 'learning_rate': '4.996e-05', 'epoch': '0.1895', 'num_input_tokens_seen': 15405722, 'train_runtime': '7793', 'train_tokens_per_second': '1977'} +{'loss': '1.818', 'grad_norm': '2.119', 'learning_rate': '4.996e-05', 'epoch': '0.1895', 'num_input_tokens_seen': 15407769, 'train_runtime': '7794', 'train_tokens_per_second': '1977'} +{'loss': '0.539', 'grad_norm': '1.092', 'learning_rate': '4.996e-05', 'epoch': '0.1895', 'num_input_tokens_seen': 15409816, 'train_runtime': '7795', 'train_tokens_per_second': '1977'} +{'loss': '0.2894', 'grad_norm': '0.8955', 'learning_rate': '4.996e-05', 'epoch': '0.1896', 'num_input_tokens_seen': 15411863, 'train_runtime': '7796', 'train_tokens_per_second': '1977'} +{'loss': '0.4234', 'grad_norm': '1.014', 'learning_rate': '4.996e-05', 'epoch': '0.1896', 'num_input_tokens_seen': 15413910, 'train_runtime': '7797', 'train_tokens_per_second': '1977'} +{'loss': '1.301', 'grad_norm': '1.747', 'learning_rate': '4.996e-05', 'epoch': '0.1896', 'num_input_tokens_seen': 15415957, 'train_runtime': '7798', 'train_tokens_per_second': '1977'} +{'loss': '0.8832', 'grad_norm': '1.145', 'learning_rate': '4.996e-05', 'epoch': '0.1896', 'num_input_tokens_seen': 15418004, 'train_runtime': '7799', 'train_tokens_per_second': '1977'} +{'loss': '2.152', 'grad_norm': '2.414', 'learning_rate': '4.996e-05', 'epoch': '0.1897', 'num_input_tokens_seen': 15420051, 'train_runtime': '7800', 'train_tokens_per_second': '1977'} +{'loss': '1.167', 'grad_norm': '1.57', 'learning_rate': '4.996e-05', 'epoch': '0.1897', 'num_input_tokens_seen': 15422098, 'train_runtime': '7801', 'train_tokens_per_second': '1977'} +{'loss': '1.57', 'grad_norm': '2.104', 'learning_rate': '4.996e-05', 'epoch': '0.1897', 'num_input_tokens_seen': 15424145, 'train_runtime': '7802', 'train_tokens_per_second': '1977'} +{'loss': '1.773', 'grad_norm': '2.455', 'learning_rate': '4.996e-05', 'epoch': '0.1897', 'num_input_tokens_seen': 15426192, 'train_runtime': '7803', 'train_tokens_per_second': '1977'} +{'loss': '0.6504', 'grad_norm': '1.235', 'learning_rate': '4.996e-05', 'epoch': '0.1898', 'num_input_tokens_seen': 15428239, 'train_runtime': '7804', 'train_tokens_per_second': '1977'} +{'loss': '1.215', 'grad_norm': '1.748', 'learning_rate': '4.996e-05', 'epoch': '0.1898', 'num_input_tokens_seen': 15430286, 'train_runtime': '7805', 'train_tokens_per_second': '1977'} +{'loss': '0.531', 'grad_norm': '1.257', 'learning_rate': '4.996e-05', 'epoch': '0.1898', 'num_input_tokens_seen': 15432333, 'train_runtime': '7806', 'train_tokens_per_second': '1977'} +{'loss': '0.7867', 'grad_norm': '1.298', 'learning_rate': '4.996e-05', 'epoch': '0.1898', 'num_input_tokens_seen': 15434380, 'train_runtime': '7807', 'train_tokens_per_second': '1977'} +{'loss': '0.7353', 'grad_norm': '1.108', 'learning_rate': '4.996e-05', 'epoch': '0.1899', 'num_input_tokens_seen': 15436427, 'train_runtime': '7808', 'train_tokens_per_second': '1977'} +{'loss': '0.7685', 'grad_norm': '1.518', 'learning_rate': '4.996e-05', 'epoch': '0.1899', 'num_input_tokens_seen': 15438474, 'train_runtime': '7809', 'train_tokens_per_second': '1977'} +{'loss': '1.054', 'grad_norm': '1.551', 'learning_rate': '4.996e-05', 'epoch': '0.1899', 'num_input_tokens_seen': 15440521, 'train_runtime': '7810', 'train_tokens_per_second': '1977'} +{'loss': '0.3387', 'grad_norm': '0.9241', 'learning_rate': '4.996e-05', 'epoch': '0.1899', 'num_input_tokens_seen': 15442568, 'train_runtime': '7811', 'train_tokens_per_second': '1977'} +{'loss': '1.361', 'grad_norm': '2.134', 'learning_rate': '4.996e-05', 'epoch': '0.19', 'num_input_tokens_seen': 15444615, 'train_runtime': '7812', 'train_tokens_per_second': '1977'} +{'loss': '0.9366', 'grad_norm': '1.69', 'learning_rate': '4.996e-05', 'epoch': '0.19', 'num_input_tokens_seen': 15446662, 'train_runtime': '7813', 'train_tokens_per_second': '1977'} +{'loss': '0.6642', 'grad_norm': '1.459', 'learning_rate': '4.996e-05', 'epoch': '0.19', 'num_input_tokens_seen': 15448709, 'train_runtime': '7814', 'train_tokens_per_second': '1977'} +{'loss': '1.339', 'grad_norm': '2.223', 'learning_rate': '4.996e-05', 'epoch': '0.19', 'num_input_tokens_seen': 15450756, 'train_runtime': '7815', 'train_tokens_per_second': '1977'} +{'loss': '0.4767', 'grad_norm': '1.184', 'learning_rate': '4.996e-05', 'epoch': '0.1901', 'num_input_tokens_seen': 15452803, 'train_runtime': '7816', 'train_tokens_per_second': '1977'} +{'loss': '0.643', 'grad_norm': '1.145', 'learning_rate': '4.996e-05', 'epoch': '0.1901', 'num_input_tokens_seen': 15454850, 'train_runtime': '7817', 'train_tokens_per_second': '1977'} +{'loss': '0.7147', 'grad_norm': '1.866', 'learning_rate': '4.996e-05', 'epoch': '0.1901', 'num_input_tokens_seen': 15456897, 'train_runtime': '7818', 'train_tokens_per_second': '1977'} +{'loss': '0.564', 'grad_norm': '1.296', 'learning_rate': '4.996e-05', 'epoch': '0.1901', 'num_input_tokens_seen': 15458944, 'train_runtime': '7819', 'train_tokens_per_second': '1977'} +{'loss': '0.4514', 'grad_norm': '0.9298', 'learning_rate': '4.996e-05', 'epoch': '0.1902', 'num_input_tokens_seen': 15460991, 'train_runtime': '7820', 'train_tokens_per_second': '1977'} +{'loss': '0.8381', 'grad_norm': '2.583', 'learning_rate': '4.996e-05', 'epoch': '0.1902', 'num_input_tokens_seen': 15463038, 'train_runtime': '7822', 'train_tokens_per_second': '1977'} +{'loss': '0.5909', 'grad_norm': '1.344', 'learning_rate': '4.996e-05', 'epoch': '0.1902', 'num_input_tokens_seen': 15465085, 'train_runtime': '7823', 'train_tokens_per_second': '1977'} +{'loss': '1.371', 'grad_norm': '1.975', 'learning_rate': '4.996e-05', 'epoch': '0.1902', 'num_input_tokens_seen': 15467132, 'train_runtime': '7824', 'train_tokens_per_second': '1977'} +{'loss': '0.306', 'grad_norm': '0.9294', 'learning_rate': '4.996e-05', 'epoch': '0.1903', 'num_input_tokens_seen': 15469179, 'train_runtime': '7825', 'train_tokens_per_second': '1977'} +{'loss': '0.9744', 'grad_norm': '1.154', 'learning_rate': '4.996e-05', 'epoch': '0.1903', 'num_input_tokens_seen': 15471226, 'train_runtime': '7826', 'train_tokens_per_second': '1977'} +{'loss': '0.4068', 'grad_norm': '1.011', 'learning_rate': '4.996e-05', 'epoch': '0.1903', 'num_input_tokens_seen': 15473273, 'train_runtime': '7827', 'train_tokens_per_second': '1977'} +{'loss': '0.3162', 'grad_norm': '1.068', 'learning_rate': '4.996e-05', 'epoch': '0.1903', 'num_input_tokens_seen': 15475320, 'train_runtime': '7828', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.774', 'learning_rate': '4.996e-05', 'epoch': '0.1904', 'num_input_tokens_seen': 15477367, 'train_runtime': '7829', 'train_tokens_per_second': '1977'} +{'loss': '0.4545', 'grad_norm': '0.9431', 'learning_rate': '4.996e-05', 'epoch': '0.1904', 'num_input_tokens_seen': 15479414, 'train_runtime': '7830', 'train_tokens_per_second': '1977'} +{'loss': '0.4907', 'grad_norm': '1.06', 'learning_rate': '4.996e-05', 'epoch': '0.1904', 'num_input_tokens_seen': 15481461, 'train_runtime': '7831', 'train_tokens_per_second': '1977'} +{'loss': '0.4083', 'grad_norm': '0.8763', 'learning_rate': '4.996e-05', 'epoch': '0.1904', 'num_input_tokens_seen': 15483508, 'train_runtime': '7832', 'train_tokens_per_second': '1977'} +{'loss': '0.8413', 'grad_norm': '1.696', 'learning_rate': '4.996e-05', 'epoch': '0.1905', 'num_input_tokens_seen': 15485555, 'train_runtime': '7833', 'train_tokens_per_second': '1977'} +{'loss': '0.345', 'grad_norm': '1.121', 'learning_rate': '4.996e-05', 'epoch': '0.1905', 'num_input_tokens_seen': 15487602, 'train_runtime': '7834', 'train_tokens_per_second': '1977'} +{'loss': '0.3183', 'grad_norm': '0.9232', 'learning_rate': '4.996e-05', 'epoch': '0.1905', 'num_input_tokens_seen': 15489649, 'train_runtime': '7835', 'train_tokens_per_second': '1977'} +{'loss': '0.7306', 'grad_norm': '1.367', 'learning_rate': '4.996e-05', 'epoch': '0.1905', 'num_input_tokens_seen': 15491696, 'train_runtime': '7836', 'train_tokens_per_second': '1977'} +{'loss': '0.5915', 'grad_norm': '1.502', 'learning_rate': '4.996e-05', 'epoch': '0.1906', 'num_input_tokens_seen': 15493743, 'train_runtime': '7837', 'train_tokens_per_second': '1977'} +{'loss': '2.323', 'grad_norm': '2.541', 'learning_rate': '4.996e-05', 'epoch': '0.1906', 'num_input_tokens_seen': 15495790, 'train_runtime': '7838', 'train_tokens_per_second': '1977'} +{'loss': '1.532', 'grad_norm': '2.549', 'learning_rate': '4.996e-05', 'epoch': '0.1906', 'num_input_tokens_seen': 15497837, 'train_runtime': '7839', 'train_tokens_per_second': '1977'} +{'loss': '1.147', 'grad_norm': '2.1', 'learning_rate': '4.996e-05', 'epoch': '0.1906', 'num_input_tokens_seen': 15499884, 'train_runtime': '7840', 'train_tokens_per_second': '1977'} +{'loss': '0.2859', 'grad_norm': '0.9579', 'learning_rate': '4.996e-05', 'epoch': '0.1907', 'num_input_tokens_seen': 15501931, 'train_runtime': '7841', 'train_tokens_per_second': '1977'} +{'loss': '0.6887', 'grad_norm': '1.434', 'learning_rate': '4.996e-05', 'epoch': '0.1907', 'num_input_tokens_seen': 15503978, 'train_runtime': '7842', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.765', 'learning_rate': '4.996e-05', 'epoch': '0.1907', 'num_input_tokens_seen': 15506025, 'train_runtime': '7843', 'train_tokens_per_second': '1977'} +{'loss': '0.7261', 'grad_norm': '1.37', 'learning_rate': '4.996e-05', 'epoch': '0.1907', 'num_input_tokens_seen': 15508072, 'train_runtime': '7844', 'train_tokens_per_second': '1977'} +{'loss': '0.7016', 'grad_norm': '1.638', 'learning_rate': '4.996e-05', 'epoch': '0.1908', 'num_input_tokens_seen': 15510119, 'train_runtime': '7845', 'train_tokens_per_second': '1977'} +{'loss': '0.8941', 'grad_norm': '1.671', 'learning_rate': '4.996e-05', 'epoch': '0.1908', 'num_input_tokens_seen': 15512166, 'train_runtime': '7846', 'train_tokens_per_second': '1977'} +{'loss': '0.3112', 'grad_norm': '0.9327', 'learning_rate': '4.996e-05', 'epoch': '0.1908', 'num_input_tokens_seen': 15514213, 'train_runtime': '7847', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '1.803', 'learning_rate': '4.996e-05', 'epoch': '0.1909', 'num_input_tokens_seen': 15516260, 'train_runtime': '7848', 'train_tokens_per_second': '1977'} +{'loss': '0.8414', 'grad_norm': '1.265', 'learning_rate': '4.996e-05', 'epoch': '0.1909', 'num_input_tokens_seen': 15518307, 'train_runtime': '7849', 'train_tokens_per_second': '1977'} +{'loss': '0.6228', 'grad_norm': '1.153', 'learning_rate': '4.996e-05', 'epoch': '0.1909', 'num_input_tokens_seen': 15520354, 'train_runtime': '7850', 'train_tokens_per_second': '1977'} +{'loss': '1.73', 'grad_norm': '2.704', 'learning_rate': '4.996e-05', 'epoch': '0.1909', 'num_input_tokens_seen': 15522401, 'train_runtime': '7851', 'train_tokens_per_second': '1977'} +{'loss': '2.086', 'grad_norm': '2.732', 'learning_rate': '4.996e-05', 'epoch': '0.191', 'num_input_tokens_seen': 15524448, 'train_runtime': '7853', 'train_tokens_per_second': '1977'} +{'loss': '0.7922', 'grad_norm': '1.432', 'learning_rate': '4.996e-05', 'epoch': '0.191', 'num_input_tokens_seen': 15526495, 'train_runtime': '7854', 'train_tokens_per_second': '1977'} +{'loss': '0.6407', 'grad_norm': '1.528', 'learning_rate': '4.996e-05', 'epoch': '0.191', 'num_input_tokens_seen': 15528542, 'train_runtime': '7855', 'train_tokens_per_second': '1977'} +{'loss': '1.582', 'grad_norm': '2.802', 'learning_rate': '4.996e-05', 'epoch': '0.191', 'num_input_tokens_seen': 15530589, 'train_runtime': '7856', 'train_tokens_per_second': '1977'} +{'loss': '0.3454', 'grad_norm': '0.8434', 'learning_rate': '4.996e-05', 'epoch': '0.1911', 'num_input_tokens_seen': 15532636, 'train_runtime': '7857', 'train_tokens_per_second': '1977'} +{'loss': '0.5589', 'grad_norm': '1.067', 'learning_rate': '4.996e-05', 'epoch': '0.1911', 'num_input_tokens_seen': 15534683, 'train_runtime': '7858', 'train_tokens_per_second': '1977'} +{'loss': '1.158', 'grad_norm': '1.405', 'learning_rate': '4.996e-05', 'epoch': '0.1911', 'num_input_tokens_seen': 15536730, 'train_runtime': '7859', 'train_tokens_per_second': '1977'} +{'loss': '0.8412', 'grad_norm': '1.284', 'learning_rate': '4.996e-05', 'epoch': '0.1911', 'num_input_tokens_seen': 15538777, 'train_runtime': '7860', 'train_tokens_per_second': '1977'} +{'loss': '0.2593', 'grad_norm': '0.8056', 'learning_rate': '4.996e-05', 'epoch': '0.1912', 'num_input_tokens_seen': 15540824, 'train_runtime': '7861', 'train_tokens_per_second': '1977'} +{'loss': '0.4313', 'grad_norm': '1.103', 'learning_rate': '4.996e-05', 'epoch': '0.1912', 'num_input_tokens_seen': 15542871, 'train_runtime': '7862', 'train_tokens_per_second': '1977'} +{'loss': '0.8904', 'grad_norm': '1.657', 'learning_rate': '4.996e-05', 'epoch': '0.1912', 'num_input_tokens_seen': 15544918, 'train_runtime': '7863', 'train_tokens_per_second': '1977'} +{'loss': '0.7562', 'grad_norm': '1.175', 'learning_rate': '4.996e-05', 'epoch': '0.1912', 'num_input_tokens_seen': 15546965, 'train_runtime': '7864', 'train_tokens_per_second': '1977'} +{'loss': '1.001', 'grad_norm': '1.531', 'learning_rate': '4.996e-05', 'epoch': '0.1913', 'num_input_tokens_seen': 15549012, 'train_runtime': '7865', 'train_tokens_per_second': '1977'} +{'loss': '0.7176', 'grad_norm': '1.486', 'learning_rate': '4.996e-05', 'epoch': '0.1913', 'num_input_tokens_seen': 15551059, 'train_runtime': '7866', 'train_tokens_per_second': '1977'} +{'loss': '0.8769', 'grad_norm': '1.401', 'learning_rate': '4.996e-05', 'epoch': '0.1913', 'num_input_tokens_seen': 15553106, 'train_runtime': '7867', 'train_tokens_per_second': '1977'} +{'loss': '0.3715', 'grad_norm': '1.079', 'learning_rate': '4.996e-05', 'epoch': '0.1913', 'num_input_tokens_seen': 15555153, 'train_runtime': '7868', 'train_tokens_per_second': '1977'} +{'loss': '0.5956', 'grad_norm': '1.006', 'learning_rate': '4.996e-05', 'epoch': '0.1914', 'num_input_tokens_seen': 15557200, 'train_runtime': '7869', 'train_tokens_per_second': '1977'} +{'loss': '0.6266', 'grad_norm': '1.415', 'learning_rate': '4.996e-05', 'epoch': '0.1914', 'num_input_tokens_seen': 15559247, 'train_runtime': '7870', 'train_tokens_per_second': '1977'} +{'loss': '1.902', 'grad_norm': '2.469', 'learning_rate': '4.996e-05', 'epoch': '0.1914', 'num_input_tokens_seen': 15561294, 'train_runtime': '7871', 'train_tokens_per_second': '1977'} +{'loss': '1.37', 'grad_norm': '2.162', 'learning_rate': '4.996e-05', 'epoch': '0.1914', 'num_input_tokens_seen': 15563341, 'train_runtime': '7872', 'train_tokens_per_second': '1977'} +{'loss': '1.011', 'grad_norm': '1.478', 'learning_rate': '4.996e-05', 'epoch': '0.1915', 'num_input_tokens_seen': 15565388, 'train_runtime': '7873', 'train_tokens_per_second': '1977'} +{'loss': '1.444', 'grad_norm': '2.24', 'learning_rate': '4.996e-05', 'epoch': '0.1915', 'num_input_tokens_seen': 15567435, 'train_runtime': '7874', 'train_tokens_per_second': '1977'} +{'loss': '1.344', 'grad_norm': '1.795', 'learning_rate': '4.996e-05', 'epoch': '0.1915', 'num_input_tokens_seen': 15569482, 'train_runtime': '7875', 'train_tokens_per_second': '1977'} +{'loss': '0.8745', 'grad_norm': '1.237', 'learning_rate': '4.996e-05', 'epoch': '0.1915', 'num_input_tokens_seen': 15571529, 'train_runtime': '7876', 'train_tokens_per_second': '1977'} +{'loss': '0.4664', 'grad_norm': '0.9764', 'learning_rate': '4.996e-05', 'epoch': '0.1916', 'num_input_tokens_seen': 15573576, 'train_runtime': '7877', 'train_tokens_per_second': '1977'} +{'loss': '0.6866', 'grad_norm': '1.377', 'learning_rate': '4.996e-05', 'epoch': '0.1916', 'num_input_tokens_seen': 15575623, 'train_runtime': '7878', 'train_tokens_per_second': '1977'} +{'loss': '0.7652', 'grad_norm': '1.197', 'learning_rate': '4.996e-05', 'epoch': '0.1916', 'num_input_tokens_seen': 15577670, 'train_runtime': '7879', 'train_tokens_per_second': '1977'} +{'loss': '0.8861', 'grad_norm': '1.182', 'learning_rate': '4.996e-05', 'epoch': '0.1916', 'num_input_tokens_seen': 15579717, 'train_runtime': '7880', 'train_tokens_per_second': '1977'} +{'loss': '0.3134', 'grad_norm': '0.9663', 'learning_rate': '4.996e-05', 'epoch': '0.1917', 'num_input_tokens_seen': 15581764, 'train_runtime': '7881', 'train_tokens_per_second': '1977'} +{'loss': '0.3089', 'grad_norm': '0.9669', 'learning_rate': '4.996e-05', 'epoch': '0.1917', 'num_input_tokens_seen': 15583811, 'train_runtime': '7883', 'train_tokens_per_second': '1977'} +{'loss': '0.8937', 'grad_norm': '1.493', 'learning_rate': '4.996e-05', 'epoch': '0.1917', 'num_input_tokens_seen': 15585858, 'train_runtime': '7884', 'train_tokens_per_second': '1977'} +{'loss': '0.7384', 'grad_norm': '1.33', 'learning_rate': '4.996e-05', 'epoch': '0.1917', 'num_input_tokens_seen': 15587905, 'train_runtime': '7885', 'train_tokens_per_second': '1977'} +{'loss': '0.5832', 'grad_norm': '1.327', 'learning_rate': '4.996e-05', 'epoch': '0.1918', 'num_input_tokens_seen': 15589952, 'train_runtime': '7886', 'train_tokens_per_second': '1977'} +{'loss': '0.4645', 'grad_norm': '1.164', 'learning_rate': '4.996e-05', 'epoch': '0.1918', 'num_input_tokens_seen': 15591999, 'train_runtime': '7887', 'train_tokens_per_second': '1977'} +{'loss': '1.261', 'grad_norm': '2.45', 'learning_rate': '4.996e-05', 'epoch': '0.1918', 'num_input_tokens_seen': 15594046, 'train_runtime': '7888', 'train_tokens_per_second': '1977'} +{'loss': '0.3201', 'grad_norm': '1.149', 'learning_rate': '4.996e-05', 'epoch': '0.1918', 'num_input_tokens_seen': 15596093, 'train_runtime': '7889', 'train_tokens_per_second': '1977'} +{'loss': '0.961', 'grad_norm': '1.795', 'learning_rate': '4.996e-05', 'epoch': '0.1919', 'num_input_tokens_seen': 15598140, 'train_runtime': '7890', 'train_tokens_per_second': '1977'} +{'loss': '1.382', 'grad_norm': '1.997', 'learning_rate': '4.996e-05', 'epoch': '0.1919', 'num_input_tokens_seen': 15600187, 'train_runtime': '7891', 'train_tokens_per_second': '1977'} +{'loss': '0.7391', 'grad_norm': '1.55', 'learning_rate': '4.996e-05', 'epoch': '0.1919', 'num_input_tokens_seen': 15602234, 'train_runtime': '7892', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.075', 'learning_rate': '4.996e-05', 'epoch': '0.1919', 'num_input_tokens_seen': 15604281, 'train_runtime': '7893', 'train_tokens_per_second': '1977'} +{'loss': '0.5818', 'grad_norm': '1.115', 'learning_rate': '4.996e-05', 'epoch': '0.192', 'num_input_tokens_seen': 15606328, 'train_runtime': '7894', 'train_tokens_per_second': '1977'} +{'loss': '0.9415', 'grad_norm': '1.249', 'learning_rate': '4.996e-05', 'epoch': '0.192', 'num_input_tokens_seen': 15608375, 'train_runtime': '7895', 'train_tokens_per_second': '1977'} +{'loss': '0.5563', 'grad_norm': '1.442', 'learning_rate': '4.996e-05', 'epoch': '0.192', 'num_input_tokens_seen': 15610422, 'train_runtime': '7896', 'train_tokens_per_second': '1977'} +{'loss': '0.5928', 'grad_norm': '1.138', 'learning_rate': '4.996e-05', 'epoch': '0.192', 'num_input_tokens_seen': 15612469, 'train_runtime': '7897', 'train_tokens_per_second': '1977'} +{'loss': '0.8017', 'grad_norm': '1.399', 'learning_rate': '4.996e-05', 'epoch': '0.1921', 'num_input_tokens_seen': 15614516, 'train_runtime': '7898', 'train_tokens_per_second': '1977'} +{'loss': '0.2273', 'grad_norm': '0.9145', 'learning_rate': '4.996e-05', 'epoch': '0.1921', 'num_input_tokens_seen': 15616563, 'train_runtime': '7899', 'train_tokens_per_second': '1977'} +{'loss': '0.9707', 'grad_norm': '1.561', 'learning_rate': '4.996e-05', 'epoch': '0.1921', 'num_input_tokens_seen': 15618610, 'train_runtime': '7900', 'train_tokens_per_second': '1977'} +{'loss': '0.5925', 'grad_norm': '1.265', 'learning_rate': '4.996e-05', 'epoch': '0.1921', 'num_input_tokens_seen': 15620657, 'train_runtime': '7901', 'train_tokens_per_second': '1977'} +{'loss': '0.39', 'grad_norm': '0.9006', 'learning_rate': '4.996e-05', 'epoch': '0.1922', 'num_input_tokens_seen': 15622704, 'train_runtime': '7902', 'train_tokens_per_second': '1977'} +{'loss': '1.671', 'grad_norm': '2.117', 'learning_rate': '4.996e-05', 'epoch': '0.1922', 'num_input_tokens_seen': 15624751, 'train_runtime': '7903', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '1.877', 'learning_rate': '4.996e-05', 'epoch': '0.1922', 'num_input_tokens_seen': 15626798, 'train_runtime': '7904', 'train_tokens_per_second': '1977'} +{'loss': '1.271', 'grad_norm': '1.775', 'learning_rate': '4.996e-05', 'epoch': '0.1922', 'num_input_tokens_seen': 15628845, 'train_runtime': '7905', 'train_tokens_per_second': '1977'} +{'loss': '0.2931', 'grad_norm': '0.9088', 'learning_rate': '4.996e-05', 'epoch': '0.1923', 'num_input_tokens_seen': 15630892, 'train_runtime': '7906', 'train_tokens_per_second': '1977'} +{'loss': '0.2585', 'grad_norm': '0.8264', 'learning_rate': '4.996e-05', 'epoch': '0.1923', 'num_input_tokens_seen': 15632939, 'train_runtime': '7907', 'train_tokens_per_second': '1977'} +{'loss': '0.4252', 'grad_norm': '1.007', 'learning_rate': '4.996e-05', 'epoch': '0.1923', 'num_input_tokens_seen': 15634986, 'train_runtime': '7908', 'train_tokens_per_second': '1977'} +{'loss': '0.5604', 'grad_norm': '1.478', 'learning_rate': '4.996e-05', 'epoch': '0.1923', 'num_input_tokens_seen': 15637033, 'train_runtime': '7909', 'train_tokens_per_second': '1977'} +{'loss': '0.6326', 'grad_norm': '1.288', 'learning_rate': '4.996e-05', 'epoch': '0.1924', 'num_input_tokens_seen': 15639080, 'train_runtime': '7910', 'train_tokens_per_second': '1977'} +{'loss': '1.387', 'grad_norm': '1.806', 'learning_rate': '4.996e-05', 'epoch': '0.1924', 'num_input_tokens_seen': 15641127, 'train_runtime': '7911', 'train_tokens_per_second': '1977'} +{'loss': '0.9997', 'grad_norm': '1.466', 'learning_rate': '4.996e-05', 'epoch': '0.1924', 'num_input_tokens_seen': 15643174, 'train_runtime': '7913', 'train_tokens_per_second': '1977'} +{'loss': '0.4498', 'grad_norm': '0.9196', 'learning_rate': '4.996e-05', 'epoch': '0.1924', 'num_input_tokens_seen': 15645221, 'train_runtime': '7914', 'train_tokens_per_second': '1977'} +{'loss': '0.2954', 'grad_norm': '0.9792', 'learning_rate': '4.996e-05', 'epoch': '0.1925', 'num_input_tokens_seen': 15647268, 'train_runtime': '7915', 'train_tokens_per_second': '1977'} +{'loss': '0.6722', 'grad_norm': '1.015', 'learning_rate': '4.996e-05', 'epoch': '0.1925', 'num_input_tokens_seen': 15649315, 'train_runtime': '7916', 'train_tokens_per_second': '1977'} +{'loss': '0.4171', 'grad_norm': '1.02', 'learning_rate': '4.996e-05', 'epoch': '0.1925', 'num_input_tokens_seen': 15651362, 'train_runtime': '7917', 'train_tokens_per_second': '1977'} +{'loss': '1.104', 'grad_norm': '1.633', 'learning_rate': '4.996e-05', 'epoch': '0.1925', 'num_input_tokens_seen': 15653409, 'train_runtime': '7918', 'train_tokens_per_second': '1977'} +{'loss': '0.5868', 'grad_norm': '1.425', 'learning_rate': '4.996e-05', 'epoch': '0.1926', 'num_input_tokens_seen': 15655456, 'train_runtime': '7919', 'train_tokens_per_second': '1977'} +{'loss': '0.7404', 'grad_norm': '1.593', 'learning_rate': '4.996e-05', 'epoch': '0.1926', 'num_input_tokens_seen': 15657503, 'train_runtime': '7920', 'train_tokens_per_second': '1977'} +{'loss': '0.7837', 'grad_norm': '1.441', 'learning_rate': '4.996e-05', 'epoch': '0.1926', 'num_input_tokens_seen': 15659550, 'train_runtime': '7921', 'train_tokens_per_second': '1977'} +{'loss': '0.3505', 'grad_norm': '0.9128', 'learning_rate': '4.996e-05', 'epoch': '0.1926', 'num_input_tokens_seen': 15661597, 'train_runtime': '7922', 'train_tokens_per_second': '1977'} +{'loss': '0.9474', 'grad_norm': '1.466', 'learning_rate': '4.996e-05', 'epoch': '0.1927', 'num_input_tokens_seen': 15663644, 'train_runtime': '7923', 'train_tokens_per_second': '1977'} +{'loss': '0.5526', 'grad_norm': '1.342', 'learning_rate': '4.996e-05', 'epoch': '0.1927', 'num_input_tokens_seen': 15665691, 'train_runtime': '7924', 'train_tokens_per_second': '1977'} +{'loss': '1.104', 'grad_norm': '1.604', 'learning_rate': '4.996e-05', 'epoch': '0.1927', 'num_input_tokens_seen': 15667738, 'train_runtime': '7925', 'train_tokens_per_second': '1977'} +{'loss': '0.5289', 'grad_norm': '1.203', 'learning_rate': '4.996e-05', 'epoch': '0.1927', 'num_input_tokens_seen': 15669785, 'train_runtime': '7926', 'train_tokens_per_second': '1977'} +{'loss': '1.631', 'grad_norm': '1.997', 'learning_rate': '4.996e-05', 'epoch': '0.1928', 'num_input_tokens_seen': 15671832, 'train_runtime': '7927', 'train_tokens_per_second': '1977'} +{'loss': '0.3103', 'grad_norm': '1.122', 'learning_rate': '4.996e-05', 'epoch': '0.1928', 'num_input_tokens_seen': 15673879, 'train_runtime': '7928', 'train_tokens_per_second': '1977'} +{'loss': '0.281', 'grad_norm': '1.104', 'learning_rate': '4.996e-05', 'epoch': '0.1928', 'num_input_tokens_seen': 15675926, 'train_runtime': '7929', 'train_tokens_per_second': '1977'} +{'loss': '0.7177', 'grad_norm': '1.439', 'learning_rate': '4.996e-05', 'epoch': '0.1928', 'num_input_tokens_seen': 15677973, 'train_runtime': '7930', 'train_tokens_per_second': '1977'} +{'loss': '0.417', 'grad_norm': '1.116', 'learning_rate': '4.996e-05', 'epoch': '0.1929', 'num_input_tokens_seen': 15680020, 'train_runtime': '7931', 'train_tokens_per_second': '1977'} +{'loss': '0.7947', 'grad_norm': '1.334', 'learning_rate': '4.996e-05', 'epoch': '0.1929', 'num_input_tokens_seen': 15682067, 'train_runtime': '7932', 'train_tokens_per_second': '1977'} +{'loss': '0.241', 'grad_norm': '0.7654', 'learning_rate': '4.996e-05', 'epoch': '0.1929', 'num_input_tokens_seen': 15684114, 'train_runtime': '7933', 'train_tokens_per_second': '1977'} +{'loss': '0.2751', 'grad_norm': '0.9323', 'learning_rate': '4.996e-05', 'epoch': '0.1929', 'num_input_tokens_seen': 15686161, 'train_runtime': '7934', 'train_tokens_per_second': '1977'} +{'loss': '0.8373', 'grad_norm': '0.9896', 'learning_rate': '4.996e-05', 'epoch': '0.193', 'num_input_tokens_seen': 15688208, 'train_runtime': '7935', 'train_tokens_per_second': '1977'} +{'loss': '0.2791', 'grad_norm': '1.025', 'learning_rate': '4.996e-05', 'epoch': '0.193', 'num_input_tokens_seen': 15690255, 'train_runtime': '7936', 'train_tokens_per_second': '1977'} +{'loss': '2.138', 'grad_norm': '2.088', 'learning_rate': '4.996e-05', 'epoch': '0.193', 'num_input_tokens_seen': 15692302, 'train_runtime': '7937', 'train_tokens_per_second': '1977'} +{'loss': '0.6852', 'grad_norm': '1.093', 'learning_rate': '4.996e-05', 'epoch': '0.193', 'num_input_tokens_seen': 15694349, 'train_runtime': '7938', 'train_tokens_per_second': '1977'} +{'loss': '0.6028', 'grad_norm': '1.24', 'learning_rate': '4.996e-05', 'epoch': '0.1931', 'num_input_tokens_seen': 15696396, 'train_runtime': '7939', 'train_tokens_per_second': '1977'} +{'loss': '0.6327', 'grad_norm': '1.49', 'learning_rate': '4.996e-05', 'epoch': '0.1931', 'num_input_tokens_seen': 15698443, 'train_runtime': '7940', 'train_tokens_per_second': '1977'} +{'loss': '1.622', 'grad_norm': '2.514', 'learning_rate': '4.996e-05', 'epoch': '0.1931', 'num_input_tokens_seen': 15700490, 'train_runtime': '7941', 'train_tokens_per_second': '1977'} +{'loss': '0.2669', 'grad_norm': '0.8977', 'learning_rate': '4.996e-05', 'epoch': '0.1931', 'num_input_tokens_seen': 15702537, 'train_runtime': '7943', 'train_tokens_per_second': '1977'} +{'loss': '1.019', 'grad_norm': '2.003', 'learning_rate': '4.996e-05', 'epoch': '0.1932', 'num_input_tokens_seen': 15704584, 'train_runtime': '7944', 'train_tokens_per_second': '1977'} +{'loss': '0.8457', 'grad_norm': '1.307', 'learning_rate': '4.996e-05', 'epoch': '0.1932', 'num_input_tokens_seen': 15706631, 'train_runtime': '7945', 'train_tokens_per_second': '1977'} +{'loss': '1.334', 'grad_norm': '1.869', 'learning_rate': '4.996e-05', 'epoch': '0.1932', 'num_input_tokens_seen': 15708678, 'train_runtime': '7946', 'train_tokens_per_second': '1977'} +{'loss': '0.458', 'grad_norm': '1.273', 'learning_rate': '4.996e-05', 'epoch': '0.1932', 'num_input_tokens_seen': 15710725, 'train_runtime': '7947', 'train_tokens_per_second': '1977'} +{'loss': '0.7194', 'grad_norm': '1.086', 'learning_rate': '4.996e-05', 'epoch': '0.1933', 'num_input_tokens_seen': 15712772, 'train_runtime': '7948', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '2.252', 'learning_rate': '4.996e-05', 'epoch': '0.1933', 'num_input_tokens_seen': 15714819, 'train_runtime': '7949', 'train_tokens_per_second': '1977'} +{'loss': '0.9099', 'grad_norm': '1.314', 'learning_rate': '4.996e-05', 'epoch': '0.1933', 'num_input_tokens_seen': 15716866, 'train_runtime': '7950', 'train_tokens_per_second': '1977'} +{'loss': '0.6468', 'grad_norm': '1.37', 'learning_rate': '4.996e-05', 'epoch': '0.1933', 'num_input_tokens_seen': 15718913, 'train_runtime': '7951', 'train_tokens_per_second': '1977'} +{'loss': '0.7499', 'grad_norm': '1.171', 'learning_rate': '4.996e-05', 'epoch': '0.1934', 'num_input_tokens_seen': 15720960, 'train_runtime': '7952', 'train_tokens_per_second': '1977'} +{'loss': '0.8022', 'grad_norm': '1.222', 'learning_rate': '4.996e-05', 'epoch': '0.1934', 'num_input_tokens_seen': 15723007, 'train_runtime': '7953', 'train_tokens_per_second': '1977'} +{'loss': '0.3628', 'grad_norm': '1.056', 'learning_rate': '4.996e-05', 'epoch': '0.1934', 'num_input_tokens_seen': 15725054, 'train_runtime': '7954', 'train_tokens_per_second': '1977'} +{'loss': '0.686', 'grad_norm': '1.301', 'learning_rate': '4.996e-05', 'epoch': '0.1934', 'num_input_tokens_seen': 15727101, 'train_runtime': '7955', 'train_tokens_per_second': '1977'} +{'loss': '0.4309', 'grad_norm': '1.065', 'learning_rate': '4.996e-05', 'epoch': '0.1935', 'num_input_tokens_seen': 15729148, 'train_runtime': '7956', 'train_tokens_per_second': '1977'} +{'loss': '1.606', 'grad_norm': '2.624', 'learning_rate': '4.996e-05', 'epoch': '0.1935', 'num_input_tokens_seen': 15731195, 'train_runtime': '7957', 'train_tokens_per_second': '1977'} +{'loss': '1.973', 'grad_norm': '2.934', 'learning_rate': '4.996e-05', 'epoch': '0.1935', 'num_input_tokens_seen': 15733242, 'train_runtime': '7958', 'train_tokens_per_second': '1977'} +{'loss': '1.624', 'grad_norm': '2.421', 'learning_rate': '4.996e-05', 'epoch': '0.1935', 'num_input_tokens_seen': 15735289, 'train_runtime': '7959', 'train_tokens_per_second': '1977'} +{'loss': '0.3481', 'grad_norm': '1.011', 'learning_rate': '4.996e-05', 'epoch': '0.1936', 'num_input_tokens_seen': 15737336, 'train_runtime': '7960', 'train_tokens_per_second': '1977'} +{'loss': '0.3563', 'grad_norm': '1.022', 'learning_rate': '4.996e-05', 'epoch': '0.1936', 'num_input_tokens_seen': 15739383, 'train_runtime': '7961', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '2.108', 'learning_rate': '4.996e-05', 'epoch': '0.1936', 'num_input_tokens_seen': 15741430, 'train_runtime': '7962', 'train_tokens_per_second': '1977'} +{'loss': '1.834', 'grad_norm': '2.396', 'learning_rate': '4.995e-05', 'epoch': '0.1936', 'num_input_tokens_seen': 15743477, 'train_runtime': '7963', 'train_tokens_per_second': '1977'} +{'loss': '0.4105', 'grad_norm': '1.067', 'learning_rate': '4.995e-05', 'epoch': '0.1937', 'num_input_tokens_seen': 15745524, 'train_runtime': '7964', 'train_tokens_per_second': '1977'} +{'loss': '0.3626', 'grad_norm': '0.8713', 'learning_rate': '4.995e-05', 'epoch': '0.1937', 'num_input_tokens_seen': 15747571, 'train_runtime': '7965', 'train_tokens_per_second': '1977'} +{'loss': '0.5254', 'grad_norm': '1.242', 'learning_rate': '4.995e-05', 'epoch': '0.1937', 'num_input_tokens_seen': 15749618, 'train_runtime': '7966', 'train_tokens_per_second': '1977'} +{'loss': '0.3501', 'grad_norm': '0.9984', 'learning_rate': '4.995e-05', 'epoch': '0.1937', 'num_input_tokens_seen': 15751665, 'train_runtime': '7967', 'train_tokens_per_second': '1977'} +{'loss': '0.4163', 'grad_norm': '1.194', 'learning_rate': '4.995e-05', 'epoch': '0.1938', 'num_input_tokens_seen': 15753712, 'train_runtime': '7968', 'train_tokens_per_second': '1977'} +{'loss': '0.3037', 'grad_norm': '0.8153', 'learning_rate': '4.995e-05', 'epoch': '0.1938', 'num_input_tokens_seen': 15755759, 'train_runtime': '7969', 'train_tokens_per_second': '1977'} +{'loss': '0.3454', 'grad_norm': '0.9208', 'learning_rate': '4.995e-05', 'epoch': '0.1938', 'num_input_tokens_seen': 15757806, 'train_runtime': '7970', 'train_tokens_per_second': '1977'} +{'loss': '0.4545', 'grad_norm': '1.163', 'learning_rate': '4.995e-05', 'epoch': '0.1938', 'num_input_tokens_seen': 15759853, 'train_runtime': '7972', 'train_tokens_per_second': '1977'} +{'loss': '2.112', 'grad_norm': '4.248', 'learning_rate': '4.995e-05', 'epoch': '0.1939', 'num_input_tokens_seen': 15761900, 'train_runtime': '7973', 'train_tokens_per_second': '1977'} +{'loss': '0.5841', 'grad_norm': '1.13', 'learning_rate': '4.995e-05', 'epoch': '0.1939', 'num_input_tokens_seen': 15763947, 'train_runtime': '7974', 'train_tokens_per_second': '1977'} +{'loss': '0.7235', 'grad_norm': '0.975', 'learning_rate': '4.995e-05', 'epoch': '0.1939', 'num_input_tokens_seen': 15765994, 'train_runtime': '7975', 'train_tokens_per_second': '1977'} +{'loss': '0.4271', 'grad_norm': '1.287', 'learning_rate': '4.995e-05', 'epoch': '0.1939', 'num_input_tokens_seen': 15768041, 'train_runtime': '7976', 'train_tokens_per_second': '1977'} +{'loss': '0.3656', 'grad_norm': '1.104', 'learning_rate': '4.995e-05', 'epoch': '0.194', 'num_input_tokens_seen': 15770088, 'train_runtime': '7977', 'train_tokens_per_second': '1977'} +{'loss': '0.8519', 'grad_norm': '1.932', 'learning_rate': '4.995e-05', 'epoch': '0.194', 'num_input_tokens_seen': 15772135, 'train_runtime': '7978', 'train_tokens_per_second': '1977'} +{'loss': '1.095', 'grad_norm': '1.718', 'learning_rate': '4.995e-05', 'epoch': '0.194', 'num_input_tokens_seen': 15774182, 'train_runtime': '7979', 'train_tokens_per_second': '1977'} +{'loss': '0.7777', 'grad_norm': '1.437', 'learning_rate': '4.995e-05', 'epoch': '0.194', 'num_input_tokens_seen': 15776229, 'train_runtime': '7980', 'train_tokens_per_second': '1977'} +{'loss': '0.7622', 'grad_norm': '1.037', 'learning_rate': '4.995e-05', 'epoch': '0.1941', 'num_input_tokens_seen': 15778276, 'train_runtime': '7981', 'train_tokens_per_second': '1977'} +{'loss': '0.8269', 'grad_norm': '1.376', 'learning_rate': '4.995e-05', 'epoch': '0.1941', 'num_input_tokens_seen': 15780323, 'train_runtime': '7982', 'train_tokens_per_second': '1977'} +{'loss': '0.623', 'grad_norm': '0.9679', 'learning_rate': '4.995e-05', 'epoch': '0.1941', 'num_input_tokens_seen': 15782370, 'train_runtime': '7983', 'train_tokens_per_second': '1977'} +{'loss': '1.144', 'grad_norm': '2.062', 'learning_rate': '4.995e-05', 'epoch': '0.1941', 'num_input_tokens_seen': 15784417, 'train_runtime': '7984', 'train_tokens_per_second': '1977'} +{'loss': '0.4312', 'grad_norm': '1.092', 'learning_rate': '4.995e-05', 'epoch': '0.1942', 'num_input_tokens_seen': 15786464, 'train_runtime': '7985', 'train_tokens_per_second': '1977'} +{'loss': '0.485', 'grad_norm': '1.08', 'learning_rate': '4.995e-05', 'epoch': '0.1942', 'num_input_tokens_seen': 15788511, 'train_runtime': '7986', 'train_tokens_per_second': '1977'} +{'loss': '1.307', 'grad_norm': '1.682', 'learning_rate': '4.995e-05', 'epoch': '0.1942', 'num_input_tokens_seen': 15790558, 'train_runtime': '7987', 'train_tokens_per_second': '1977'} +{'loss': '1.027', 'grad_norm': '1.525', 'learning_rate': '4.995e-05', 'epoch': '0.1942', 'num_input_tokens_seen': 15792605, 'train_runtime': '7988', 'train_tokens_per_second': '1977'} +{'loss': '1.01', 'grad_norm': '2.133', 'learning_rate': '4.995e-05', 'epoch': '0.1943', 'num_input_tokens_seen': 15794652, 'train_runtime': '7989', 'train_tokens_per_second': '1977'} +{'loss': '0.5112', 'grad_norm': '1.289', 'learning_rate': '4.995e-05', 'epoch': '0.1943', 'num_input_tokens_seen': 15796699, 'train_runtime': '7990', 'train_tokens_per_second': '1977'} +{'loss': '0.9651', 'grad_norm': '1.479', 'learning_rate': '4.995e-05', 'epoch': '0.1943', 'num_input_tokens_seen': 15798746, 'train_runtime': '7991', 'train_tokens_per_second': '1977'} +{'loss': '0.6181', 'grad_norm': '1.271', 'learning_rate': '4.995e-05', 'epoch': '0.1944', 'num_input_tokens_seen': 15800793, 'train_runtime': '7992', 'train_tokens_per_second': '1977'} +{'loss': '1.143', 'grad_norm': '1.637', 'learning_rate': '4.995e-05', 'epoch': '0.1944', 'num_input_tokens_seen': 15802840, 'train_runtime': '7993', 'train_tokens_per_second': '1977'} +{'loss': '1.351', 'grad_norm': '2.431', 'learning_rate': '4.995e-05', 'epoch': '0.1944', 'num_input_tokens_seen': 15804887, 'train_runtime': '7994', 'train_tokens_per_second': '1977'} +{'loss': '0.8673', 'grad_norm': '1.306', 'learning_rate': '4.995e-05', 'epoch': '0.1944', 'num_input_tokens_seen': 15806934, 'train_runtime': '7995', 'train_tokens_per_second': '1977'} +{'loss': '0.7854', 'grad_norm': '1.491', 'learning_rate': '4.995e-05', 'epoch': '0.1945', 'num_input_tokens_seen': 15808981, 'train_runtime': '7996', 'train_tokens_per_second': '1977'} +{'loss': '2.352', 'grad_norm': '2.942', 'learning_rate': '4.995e-05', 'epoch': '0.1945', 'num_input_tokens_seen': 15811028, 'train_runtime': '7997', 'train_tokens_per_second': '1977'} +{'loss': '0.7897', 'grad_norm': '1.382', 'learning_rate': '4.995e-05', 'epoch': '0.1945', 'num_input_tokens_seen': 15813075, 'train_runtime': '7998', 'train_tokens_per_second': '1977'} +{'loss': '0.3494', 'grad_norm': '1.034', 'learning_rate': '4.995e-05', 'epoch': '0.1945', 'num_input_tokens_seen': 15815122, 'train_runtime': '7999', 'train_tokens_per_second': '1977'} +{'loss': '0.9337', 'grad_norm': '1.612', 'learning_rate': '4.995e-05', 'epoch': '0.1946', 'num_input_tokens_seen': 15817169, 'train_runtime': '8000', 'train_tokens_per_second': '1977'} +{'loss': '0.9596', 'grad_norm': '1.303', 'learning_rate': '4.995e-05', 'epoch': '0.1946', 'num_input_tokens_seen': 15819216, 'train_runtime': '8002', 'train_tokens_per_second': '1977'} +{'loss': '1.218', 'grad_norm': '2.341', 'learning_rate': '4.995e-05', 'epoch': '0.1946', 'num_input_tokens_seen': 15821263, 'train_runtime': '8003', 'train_tokens_per_second': '1977'} +{'loss': '0.3456', 'grad_norm': '0.9166', 'learning_rate': '4.995e-05', 'epoch': '0.1946', 'num_input_tokens_seen': 15823310, 'train_runtime': '8004', 'train_tokens_per_second': '1977'} +{'loss': '0.3632', 'grad_norm': '1.034', 'learning_rate': '4.995e-05', 'epoch': '0.1947', 'num_input_tokens_seen': 15825357, 'train_runtime': '8005', 'train_tokens_per_second': '1977'} +{'loss': '0.5839', 'grad_norm': '1.234', 'learning_rate': '4.995e-05', 'epoch': '0.1947', 'num_input_tokens_seen': 15827404, 'train_runtime': '8006', 'train_tokens_per_second': '1977'} +{'loss': '0.8154', 'grad_norm': '1.467', 'learning_rate': '4.995e-05', 'epoch': '0.1947', 'num_input_tokens_seen': 15829451, 'train_runtime': '8007', 'train_tokens_per_second': '1977'} +{'loss': '0.8684', 'grad_norm': '1.394', 'learning_rate': '4.995e-05', 'epoch': '0.1947', 'num_input_tokens_seen': 15831498, 'train_runtime': '8008', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.215', 'learning_rate': '4.995e-05', 'epoch': '0.1948', 'num_input_tokens_seen': 15833545, 'train_runtime': '8009', 'train_tokens_per_second': '1977'} +{'loss': '0.6438', 'grad_norm': '1.36', 'learning_rate': '4.995e-05', 'epoch': '0.1948', 'num_input_tokens_seen': 15835592, 'train_runtime': '8010', 'train_tokens_per_second': '1977'} +{'loss': '0.4945', 'grad_norm': '1.33', 'learning_rate': '4.995e-05', 'epoch': '0.1948', 'num_input_tokens_seen': 15837639, 'train_runtime': '8011', 'train_tokens_per_second': '1977'} +{'loss': '0.9026', 'grad_norm': '1.429', 'learning_rate': '4.995e-05', 'epoch': '0.1948', 'num_input_tokens_seen': 15839686, 'train_runtime': '8012', 'train_tokens_per_second': '1977'} +{'loss': '1.132', 'grad_norm': '1.82', 'learning_rate': '4.995e-05', 'epoch': '0.1949', 'num_input_tokens_seen': 15841733, 'train_runtime': '8013', 'train_tokens_per_second': '1977'} +{'loss': '1.248', 'grad_norm': '2.273', 'learning_rate': '4.995e-05', 'epoch': '0.1949', 'num_input_tokens_seen': 15843780, 'train_runtime': '8014', 'train_tokens_per_second': '1977'} +{'loss': '1.709', 'grad_norm': '2.093', 'learning_rate': '4.995e-05', 'epoch': '0.1949', 'num_input_tokens_seen': 15845827, 'train_runtime': '8015', 'train_tokens_per_second': '1977'} +{'loss': '0.3531', 'grad_norm': '1.057', 'learning_rate': '4.995e-05', 'epoch': '0.1949', 'num_input_tokens_seen': 15847874, 'train_runtime': '8016', 'train_tokens_per_second': '1977'} +{'loss': '0.2471', 'grad_norm': '0.9552', 'learning_rate': '4.995e-05', 'epoch': '0.195', 'num_input_tokens_seen': 15849921, 'train_runtime': '8017', 'train_tokens_per_second': '1977'} +{'loss': '0.3682', 'grad_norm': '1.088', 'learning_rate': '4.995e-05', 'epoch': '0.195', 'num_input_tokens_seen': 15851968, 'train_runtime': '8018', 'train_tokens_per_second': '1977'} +{'loss': '2.01', 'grad_norm': '2.348', 'learning_rate': '4.995e-05', 'epoch': '0.195', 'num_input_tokens_seen': 15854015, 'train_runtime': '8019', 'train_tokens_per_second': '1977'} +{'loss': '0.9141', 'grad_norm': '1.597', 'learning_rate': '4.995e-05', 'epoch': '0.195', 'num_input_tokens_seen': 15856062, 'train_runtime': '8020', 'train_tokens_per_second': '1977'} +{'loss': '2.306', 'grad_norm': '3.168', 'learning_rate': '4.995e-05', 'epoch': '0.1951', 'num_input_tokens_seen': 15858109, 'train_runtime': '8021', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '1.886', 'learning_rate': '4.995e-05', 'epoch': '0.1951', 'num_input_tokens_seen': 15860156, 'train_runtime': '8022', 'train_tokens_per_second': '1977'} +{'loss': '0.5809', 'grad_norm': '1.353', 'learning_rate': '4.995e-05', 'epoch': '0.1951', 'num_input_tokens_seen': 15862203, 'train_runtime': '8023', 'train_tokens_per_second': '1977'} +{'loss': '0.835', 'grad_norm': '1.24', 'learning_rate': '4.995e-05', 'epoch': '0.1951', 'num_input_tokens_seen': 15864250, 'train_runtime': '8024', 'train_tokens_per_second': '1977'} +{'loss': '0.246', 'grad_norm': '0.8723', 'learning_rate': '4.995e-05', 'epoch': '0.1952', 'num_input_tokens_seen': 15866297, 'train_runtime': '8025', 'train_tokens_per_second': '1977'} +{'loss': '0.5582', 'grad_norm': '1.315', 'learning_rate': '4.995e-05', 'epoch': '0.1952', 'num_input_tokens_seen': 15868344, 'train_runtime': '8026', 'train_tokens_per_second': '1977'} +{'loss': '0.4209', 'grad_norm': '1.147', 'learning_rate': '4.995e-05', 'epoch': '0.1952', 'num_input_tokens_seen': 15870391, 'train_runtime': '8027', 'train_tokens_per_second': '1977'} +{'loss': '1.405', 'grad_norm': '2.844', 'learning_rate': '4.995e-05', 'epoch': '0.1952', 'num_input_tokens_seen': 15872438, 'train_runtime': '8028', 'train_tokens_per_second': '1977'} +{'loss': '0.5095', 'grad_norm': '1.102', 'learning_rate': '4.995e-05', 'epoch': '0.1953', 'num_input_tokens_seen': 15874485, 'train_runtime': '8029', 'train_tokens_per_second': '1977'} +{'loss': '0.7168', 'grad_norm': '1.405', 'learning_rate': '4.995e-05', 'epoch': '0.1953', 'num_input_tokens_seen': 15876532, 'train_runtime': '8030', 'train_tokens_per_second': '1977'} +{'loss': '0.3489', 'grad_norm': '0.852', 'learning_rate': '4.995e-05', 'epoch': '0.1953', 'num_input_tokens_seen': 15878579, 'train_runtime': '8032', 'train_tokens_per_second': '1977'} +{'loss': '0.6493', 'grad_norm': '1.183', 'learning_rate': '4.995e-05', 'epoch': '0.1953', 'num_input_tokens_seen': 15880626, 'train_runtime': '8033', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '1.658', 'learning_rate': '4.995e-05', 'epoch': '0.1954', 'num_input_tokens_seen': 15882673, 'train_runtime': '8034', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.065', 'learning_rate': '4.995e-05', 'epoch': '0.1954', 'num_input_tokens_seen': 15884720, 'train_runtime': '8035', 'train_tokens_per_second': '1977'} +{'loss': '1.301', 'grad_norm': '1.92', 'learning_rate': '4.995e-05', 'epoch': '0.1954', 'num_input_tokens_seen': 15886767, 'train_runtime': '8036', 'train_tokens_per_second': '1977'} +{'loss': '0.8699', 'grad_norm': '1.589', 'learning_rate': '4.995e-05', 'epoch': '0.1954', 'num_input_tokens_seen': 15888814, 'train_runtime': '8037', 'train_tokens_per_second': '1977'} +{'loss': '2.134', 'grad_norm': '2.627', 'learning_rate': '4.995e-05', 'epoch': '0.1955', 'num_input_tokens_seen': 15890861, 'train_runtime': '8038', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '1.915', 'learning_rate': '4.995e-05', 'epoch': '0.1955', 'num_input_tokens_seen': 15892908, 'train_runtime': '8039', 'train_tokens_per_second': '1977'} +{'loss': '1.466', 'grad_norm': '2.267', 'learning_rate': '4.995e-05', 'epoch': '0.1955', 'num_input_tokens_seen': 15894955, 'train_runtime': '8040', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.048', 'learning_rate': '4.995e-05', 'epoch': '0.1955', 'num_input_tokens_seen': 15897002, 'train_runtime': '8041', 'train_tokens_per_second': '1977'} +{'loss': '0.8302', 'grad_norm': '1.122', 'learning_rate': '4.995e-05', 'epoch': '0.1956', 'num_input_tokens_seen': 15899049, 'train_runtime': '8042', 'train_tokens_per_second': '1977'} +{'loss': '1.336', 'grad_norm': '2.035', 'learning_rate': '4.995e-05', 'epoch': '0.1956', 'num_input_tokens_seen': 15901096, 'train_runtime': '8043', 'train_tokens_per_second': '1977'} +{'loss': '1.094', 'grad_norm': '1.218', 'learning_rate': '4.995e-05', 'epoch': '0.1956', 'num_input_tokens_seen': 15903143, 'train_runtime': '8044', 'train_tokens_per_second': '1977'} +{'loss': '0.7434', 'grad_norm': '1.352', 'learning_rate': '4.995e-05', 'epoch': '0.1956', 'num_input_tokens_seen': 15905190, 'train_runtime': '8045', 'train_tokens_per_second': '1977'} +{'loss': '0.4711', 'grad_norm': '0.8965', 'learning_rate': '4.995e-05', 'epoch': '0.1957', 'num_input_tokens_seen': 15907237, 'train_runtime': '8046', 'train_tokens_per_second': '1977'} +{'loss': '0.4208', 'grad_norm': '0.9619', 'learning_rate': '4.995e-05', 'epoch': '0.1957', 'num_input_tokens_seen': 15909284, 'train_runtime': '8047', 'train_tokens_per_second': '1977'} +{'loss': '0.3652', 'grad_norm': '1.126', 'learning_rate': '4.995e-05', 'epoch': '0.1957', 'num_input_tokens_seen': 15911331, 'train_runtime': '8048', 'train_tokens_per_second': '1977'} +{'loss': '0.4274', 'grad_norm': '0.9372', 'learning_rate': '4.995e-05', 'epoch': '0.1957', 'num_input_tokens_seen': 15913378, 'train_runtime': '8049', 'train_tokens_per_second': '1977'} +{'loss': '0.6732', 'grad_norm': '1.424', 'learning_rate': '4.995e-05', 'epoch': '0.1958', 'num_input_tokens_seen': 15915425, 'train_runtime': '8050', 'train_tokens_per_second': '1977'} +{'loss': '0.6932', 'grad_norm': '1.135', 'learning_rate': '4.995e-05', 'epoch': '0.1958', 'num_input_tokens_seen': 15917472, 'train_runtime': '8051', 'train_tokens_per_second': '1977'} +{'loss': '0.3996', 'grad_norm': '1.045', 'learning_rate': '4.995e-05', 'epoch': '0.1958', 'num_input_tokens_seen': 15919519, 'train_runtime': '8052', 'train_tokens_per_second': '1977'} +{'loss': '0.2531', 'grad_norm': '0.8693', 'learning_rate': '4.995e-05', 'epoch': '0.1958', 'num_input_tokens_seen': 15921566, 'train_runtime': '8053', 'train_tokens_per_second': '1977'} +{'loss': '0.6251', 'grad_norm': '1.242', 'learning_rate': '4.995e-05', 'epoch': '0.1959', 'num_input_tokens_seen': 15923613, 'train_runtime': '8054', 'train_tokens_per_second': '1977'} +{'loss': '0.3565', 'grad_norm': '1.096', 'learning_rate': '4.995e-05', 'epoch': '0.1959', 'num_input_tokens_seen': 15925660, 'train_runtime': '8055', 'train_tokens_per_second': '1977'} +{'loss': '0.3303', 'grad_norm': '0.9536', 'learning_rate': '4.995e-05', 'epoch': '0.1959', 'num_input_tokens_seen': 15927707, 'train_runtime': '8056', 'train_tokens_per_second': '1977'} +{'loss': '0.6882', 'grad_norm': '1.071', 'learning_rate': '4.995e-05', 'epoch': '0.1959', 'num_input_tokens_seen': 15929754, 'train_runtime': '8057', 'train_tokens_per_second': '1977'} +{'loss': '0.5415', 'grad_norm': '1.303', 'learning_rate': '4.995e-05', 'epoch': '0.196', 'num_input_tokens_seen': 15931801, 'train_runtime': '8058', 'train_tokens_per_second': '1977'} +{'loss': '0.6354', 'grad_norm': '1.314', 'learning_rate': '4.995e-05', 'epoch': '0.196', 'num_input_tokens_seen': 15933848, 'train_runtime': '8059', 'train_tokens_per_second': '1977'} +{'loss': '0.7381', 'grad_norm': '1.703', 'learning_rate': '4.995e-05', 'epoch': '0.196', 'num_input_tokens_seen': 15935895, 'train_runtime': '8061', 'train_tokens_per_second': '1977'} +{'loss': '0.6781', 'grad_norm': '1.023', 'learning_rate': '4.995e-05', 'epoch': '0.196', 'num_input_tokens_seen': 15937942, 'train_runtime': '8062', 'train_tokens_per_second': '1977'} +{'loss': '0.5167', 'grad_norm': '1.213', 'learning_rate': '4.995e-05', 'epoch': '0.1961', 'num_input_tokens_seen': 15939989, 'train_runtime': '8063', 'train_tokens_per_second': '1977'} +{'loss': '1.381', 'grad_norm': '2.098', 'learning_rate': '4.995e-05', 'epoch': '0.1961', 'num_input_tokens_seen': 15942036, 'train_runtime': '8064', 'train_tokens_per_second': '1977'} +{'loss': '0.4277', 'grad_norm': '1.02', 'learning_rate': '4.995e-05', 'epoch': '0.1961', 'num_input_tokens_seen': 15944083, 'train_runtime': '8065', 'train_tokens_per_second': '1977'} +{'loss': '0.7401', 'grad_norm': '1.307', 'learning_rate': '4.995e-05', 'epoch': '0.1961', 'num_input_tokens_seen': 15946130, 'train_runtime': '8066', 'train_tokens_per_second': '1977'} +{'loss': '1.906', 'grad_norm': '2.102', 'learning_rate': '4.995e-05', 'epoch': '0.1962', 'num_input_tokens_seen': 15948177, 'train_runtime': '8067', 'train_tokens_per_second': '1977'} +{'loss': '0.5753', 'grad_norm': '1.042', 'learning_rate': '4.995e-05', 'epoch': '0.1962', 'num_input_tokens_seen': 15950224, 'train_runtime': '8068', 'train_tokens_per_second': '1977'} +{'loss': '2.302', 'grad_norm': '2.687', 'learning_rate': '4.995e-05', 'epoch': '0.1962', 'num_input_tokens_seen': 15952271, 'train_runtime': '8069', 'train_tokens_per_second': '1977'} +{'loss': '1.741', 'grad_norm': '2.297', 'learning_rate': '4.995e-05', 'epoch': '0.1962', 'num_input_tokens_seen': 15954318, 'train_runtime': '8070', 'train_tokens_per_second': '1977'} +{'loss': '0.756', 'grad_norm': '1.427', 'learning_rate': '4.995e-05', 'epoch': '0.1963', 'num_input_tokens_seen': 15956365, 'train_runtime': '8071', 'train_tokens_per_second': '1977'} +{'loss': '1.121', 'grad_norm': '1.574', 'learning_rate': '4.995e-05', 'epoch': '0.1963', 'num_input_tokens_seen': 15958412, 'train_runtime': '8072', 'train_tokens_per_second': '1977'} +{'loss': '0.5503', 'grad_norm': '1.251', 'learning_rate': '4.995e-05', 'epoch': '0.1963', 'num_input_tokens_seen': 15960459, 'train_runtime': '8073', 'train_tokens_per_second': '1977'} +{'loss': '0.631', 'grad_norm': '1.318', 'learning_rate': '4.995e-05', 'epoch': '0.1963', 'num_input_tokens_seen': 15962506, 'train_runtime': '8074', 'train_tokens_per_second': '1977'} +{'loss': '0.8101', 'grad_norm': '1.377', 'learning_rate': '4.995e-05', 'epoch': '0.1964', 'num_input_tokens_seen': 15964553, 'train_runtime': '8075', 'train_tokens_per_second': '1977'} +{'loss': '1.47', 'grad_norm': '2.311', 'learning_rate': '4.995e-05', 'epoch': '0.1964', 'num_input_tokens_seen': 15966600, 'train_runtime': '8076', 'train_tokens_per_second': '1977'} +{'loss': '0.3766', 'grad_norm': '1.048', 'learning_rate': '4.995e-05', 'epoch': '0.1964', 'num_input_tokens_seen': 15968647, 'train_runtime': '8077', 'train_tokens_per_second': '1977'} +{'loss': '0.707', 'grad_norm': '1.361', 'learning_rate': '4.995e-05', 'epoch': '0.1964', 'num_input_tokens_seen': 15970694, 'train_runtime': '8078', 'train_tokens_per_second': '1977'} +{'loss': '0.4373', 'grad_norm': '1.159', 'learning_rate': '4.995e-05', 'epoch': '0.1965', 'num_input_tokens_seen': 15972741, 'train_runtime': '8079', 'train_tokens_per_second': '1977'} +{'loss': '0.7301', 'grad_norm': '1.515', 'learning_rate': '4.995e-05', 'epoch': '0.1965', 'num_input_tokens_seen': 15974788, 'train_runtime': '8080', 'train_tokens_per_second': '1977'} +{'loss': '0.5849', 'grad_norm': '1.435', 'learning_rate': '4.995e-05', 'epoch': '0.1965', 'num_input_tokens_seen': 15976835, 'train_runtime': '8081', 'train_tokens_per_second': '1977'} +{'loss': '0.6746', 'grad_norm': '1.368', 'learning_rate': '4.995e-05', 'epoch': '0.1965', 'num_input_tokens_seen': 15978882, 'train_runtime': '8082', 'train_tokens_per_second': '1977'} +{'loss': '0.9405', 'grad_norm': '1.945', 'learning_rate': '4.995e-05', 'epoch': '0.1966', 'num_input_tokens_seen': 15980929, 'train_runtime': '8083', 'train_tokens_per_second': '1977'} +{'loss': '0.3237', 'grad_norm': '0.96', 'learning_rate': '4.995e-05', 'epoch': '0.1966', 'num_input_tokens_seen': 15982976, 'train_runtime': '8084', 'train_tokens_per_second': '1977'} +{'loss': '1.274', 'grad_norm': '2.039', 'learning_rate': '4.995e-05', 'epoch': '0.1966', 'num_input_tokens_seen': 15985023, 'train_runtime': '8085', 'train_tokens_per_second': '1977'} +{'loss': '2.054', 'grad_norm': '3.201', 'learning_rate': '4.995e-05', 'epoch': '0.1966', 'num_input_tokens_seen': 15987070, 'train_runtime': '8087', 'train_tokens_per_second': '1977'} +{'loss': '0.3709', 'grad_norm': '0.9342', 'learning_rate': '4.995e-05', 'epoch': '0.1967', 'num_input_tokens_seen': 15989117, 'train_runtime': '8088', 'train_tokens_per_second': '1977'} +{'loss': '0.6796', 'grad_norm': '1.103', 'learning_rate': '4.995e-05', 'epoch': '0.1967', 'num_input_tokens_seen': 15991164, 'train_runtime': '8089', 'train_tokens_per_second': '1977'} +{'loss': '0.8356', 'grad_norm': '1.54', 'learning_rate': '4.995e-05', 'epoch': '0.1967', 'num_input_tokens_seen': 15993211, 'train_runtime': '8090', 'train_tokens_per_second': '1977'} +{'loss': '1.469', 'grad_norm': '2.166', 'learning_rate': '4.995e-05', 'epoch': '0.1967', 'num_input_tokens_seen': 15995258, 'train_runtime': '8091', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '1.34', 'learning_rate': '4.995e-05', 'epoch': '0.1968', 'num_input_tokens_seen': 15997305, 'train_runtime': '8092', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.069', 'learning_rate': '4.995e-05', 'epoch': '0.1968', 'num_input_tokens_seen': 15999352, 'train_runtime': '8093', 'train_tokens_per_second': '1977'} +{'loss': '0.8847', 'grad_norm': '1.397', 'learning_rate': '4.995e-05', 'epoch': '0.1968', 'num_input_tokens_seen': 16001399, 'train_runtime': '8094', 'train_tokens_per_second': '1977'} +{'loss': '2.219', 'grad_norm': '2.48', 'learning_rate': '4.995e-05', 'epoch': '0.1968', 'num_input_tokens_seen': 16003446, 'train_runtime': '8095', 'train_tokens_per_second': '1977'} +{'loss': '0.6179', 'grad_norm': '1.142', 'learning_rate': '4.995e-05', 'epoch': '0.1969', 'num_input_tokens_seen': 16005493, 'train_runtime': '8096', 'train_tokens_per_second': '1977'} +{'loss': '0.8324', 'grad_norm': '1.39', 'learning_rate': '4.995e-05', 'epoch': '0.1969', 'num_input_tokens_seen': 16007540, 'train_runtime': '8097', 'train_tokens_per_second': '1977'} +{'loss': '0.6726', 'grad_norm': '1.208', 'learning_rate': '4.995e-05', 'epoch': '0.1969', 'num_input_tokens_seen': 16009587, 'train_runtime': '8098', 'train_tokens_per_second': '1977'} +{'loss': '0.3394', 'grad_norm': '0.965', 'learning_rate': '4.995e-05', 'epoch': '0.1969', 'num_input_tokens_seen': 16011634, 'train_runtime': '8099', 'train_tokens_per_second': '1977'} +{'loss': '0.2928', 'grad_norm': '0.9062', 'learning_rate': '4.995e-05', 'epoch': '0.197', 'num_input_tokens_seen': 16013681, 'train_runtime': '8100', 'train_tokens_per_second': '1977'} +{'loss': '0.4939', 'grad_norm': '1.138', 'learning_rate': '4.995e-05', 'epoch': '0.197', 'num_input_tokens_seen': 16015728, 'train_runtime': '8101', 'train_tokens_per_second': '1977'} +{'loss': '2.165', 'grad_norm': '3.141', 'learning_rate': '4.995e-05', 'epoch': '0.197', 'num_input_tokens_seen': 16017775, 'train_runtime': '8102', 'train_tokens_per_second': '1977'} +{'loss': '0.5633', 'grad_norm': '1.249', 'learning_rate': '4.995e-05', 'epoch': '0.197', 'num_input_tokens_seen': 16019822, 'train_runtime': '8103', 'train_tokens_per_second': '1977'} +{'loss': '0.4499', 'grad_norm': '1.048', 'learning_rate': '4.995e-05', 'epoch': '0.1971', 'num_input_tokens_seen': 16021869, 'train_runtime': '8104', 'train_tokens_per_second': '1977'} +{'loss': '0.2835', 'grad_norm': '0.8794', 'learning_rate': '4.995e-05', 'epoch': '0.1971', 'num_input_tokens_seen': 16023916, 'train_runtime': '8105', 'train_tokens_per_second': '1977'} +{'loss': '0.7501', 'grad_norm': '0.8735', 'learning_rate': '4.995e-05', 'epoch': '0.1971', 'num_input_tokens_seen': 16025963, 'train_runtime': '8106', 'train_tokens_per_second': '1977'} +{'loss': '0.9465', 'grad_norm': '1.586', 'learning_rate': '4.995e-05', 'epoch': '0.1971', 'num_input_tokens_seen': 16028010, 'train_runtime': '8107', 'train_tokens_per_second': '1977'} +{'loss': '0.9919', 'grad_norm': '1.812', 'learning_rate': '4.995e-05', 'epoch': '0.1972', 'num_input_tokens_seen': 16030057, 'train_runtime': '8108', 'train_tokens_per_second': '1977'} +{'loss': '0.5161', 'grad_norm': '1.032', 'learning_rate': '4.995e-05', 'epoch': '0.1972', 'num_input_tokens_seen': 16032104, 'train_runtime': '8109', 'train_tokens_per_second': '1977'} +{'loss': '1.447', 'grad_norm': '1.702', 'learning_rate': '4.995e-05', 'epoch': '0.1972', 'num_input_tokens_seen': 16034151, 'train_runtime': '8110', 'train_tokens_per_second': '1977'} +{'loss': '0.8021', 'grad_norm': '1.676', 'learning_rate': '4.995e-05', 'epoch': '0.1972', 'num_input_tokens_seen': 16036198, 'train_runtime': '8111', 'train_tokens_per_second': '1977'} +{'loss': '0.2904', 'grad_norm': '0.9061', 'learning_rate': '4.995e-05', 'epoch': '0.1973', 'num_input_tokens_seen': 16038245, 'train_runtime': '8112', 'train_tokens_per_second': '1977'} +{'loss': '0.9802', 'grad_norm': '1.525', 'learning_rate': '4.995e-05', 'epoch': '0.1973', 'num_input_tokens_seen': 16040292, 'train_runtime': '8113', 'train_tokens_per_second': '1977'} +{'loss': '0.2578', 'grad_norm': '0.8461', 'learning_rate': '4.995e-05', 'epoch': '0.1973', 'num_input_tokens_seen': 16042339, 'train_runtime': '8114', 'train_tokens_per_second': '1977'} +{'loss': '0.9204', 'grad_norm': '1.466', 'learning_rate': '4.995e-05', 'epoch': '0.1973', 'num_input_tokens_seen': 16044386, 'train_runtime': '8115', 'train_tokens_per_second': '1977'} +{'loss': '0.6172', 'grad_norm': '1.41', 'learning_rate': '4.995e-05', 'epoch': '0.1974', 'num_input_tokens_seen': 16046433, 'train_runtime': '8117', 'train_tokens_per_second': '1977'} +{'loss': '0.4345', 'grad_norm': '1.059', 'learning_rate': '4.995e-05', 'epoch': '0.1974', 'num_input_tokens_seen': 16048480, 'train_runtime': '8118', 'train_tokens_per_second': '1977'} +{'loss': '0.3652', 'grad_norm': '0.7981', 'learning_rate': '4.995e-05', 'epoch': '0.1974', 'num_input_tokens_seen': 16050527, 'train_runtime': '8119', 'train_tokens_per_second': '1977'} +{'loss': '0.681', 'grad_norm': '1.245', 'learning_rate': '4.995e-05', 'epoch': '0.1974', 'num_input_tokens_seen': 16052574, 'train_runtime': '8120', 'train_tokens_per_second': '1977'} +{'loss': '0.3267', 'grad_norm': '0.9842', 'learning_rate': '4.995e-05', 'epoch': '0.1975', 'num_input_tokens_seen': 16054621, 'train_runtime': '8121', 'train_tokens_per_second': '1977'} +{'loss': '0.2845', 'grad_norm': '0.872', 'learning_rate': '4.995e-05', 'epoch': '0.1975', 'num_input_tokens_seen': 16056668, 'train_runtime': '8122', 'train_tokens_per_second': '1977'} +{'loss': '0.5049', 'grad_norm': '1.235', 'learning_rate': '4.995e-05', 'epoch': '0.1975', 'num_input_tokens_seen': 16058715, 'train_runtime': '8123', 'train_tokens_per_second': '1977'} +{'loss': '0.3489', 'grad_norm': '1.185', 'learning_rate': '4.995e-05', 'epoch': '0.1975', 'num_input_tokens_seen': 16060762, 'train_runtime': '8124', 'train_tokens_per_second': '1977'} +{'loss': '0.3496', 'grad_norm': '1.03', 'learning_rate': '4.995e-05', 'epoch': '0.1976', 'num_input_tokens_seen': 16062809, 'train_runtime': '8125', 'train_tokens_per_second': '1977'} +{'loss': '0.7486', 'grad_norm': '1.641', 'learning_rate': '4.995e-05', 'epoch': '0.1976', 'num_input_tokens_seen': 16064856, 'train_runtime': '8126', 'train_tokens_per_second': '1977'} +{'loss': '0.3275', 'grad_norm': '1.101', 'learning_rate': '4.995e-05', 'epoch': '0.1976', 'num_input_tokens_seen': 16066903, 'train_runtime': '8127', 'train_tokens_per_second': '1977'} +{'loss': '0.3186', 'grad_norm': '1.239', 'learning_rate': '4.995e-05', 'epoch': '0.1976', 'num_input_tokens_seen': 16068950, 'train_runtime': '8128', 'train_tokens_per_second': '1977'} +{'loss': '0.7393', 'grad_norm': '1.603', 'learning_rate': '4.995e-05', 'epoch': '0.1977', 'num_input_tokens_seen': 16070997, 'train_runtime': '8129', 'train_tokens_per_second': '1977'} +{'loss': '0.8451', 'grad_norm': '1.103', 'learning_rate': '4.995e-05', 'epoch': '0.1977', 'num_input_tokens_seen': 16073044, 'train_runtime': '8130', 'train_tokens_per_second': '1977'} +{'loss': '0.7812', 'grad_norm': '1.243', 'learning_rate': '4.995e-05', 'epoch': '0.1977', 'num_input_tokens_seen': 16075091, 'train_runtime': '8131', 'train_tokens_per_second': '1977'} +{'loss': '0.2227', 'grad_norm': '0.8311', 'learning_rate': '4.995e-05', 'epoch': '0.1977', 'num_input_tokens_seen': 16077138, 'train_runtime': '8132', 'train_tokens_per_second': '1977'} +{'loss': '0.8794', 'grad_norm': '1.407', 'learning_rate': '4.995e-05', 'epoch': '0.1978', 'num_input_tokens_seen': 16079185, 'train_runtime': '8133', 'train_tokens_per_second': '1977'} +{'loss': '0.6777', 'grad_norm': '1.024', 'learning_rate': '4.995e-05', 'epoch': '0.1978', 'num_input_tokens_seen': 16081232, 'train_runtime': '8134', 'train_tokens_per_second': '1977'} +{'loss': '0.4218', 'grad_norm': '1.106', 'learning_rate': '4.995e-05', 'epoch': '0.1978', 'num_input_tokens_seen': 16083279, 'train_runtime': '8135', 'train_tokens_per_second': '1977'} +{'loss': '0.359', 'grad_norm': '0.8754', 'learning_rate': '4.995e-05', 'epoch': '0.1978', 'num_input_tokens_seen': 16085326, 'train_runtime': '8136', 'train_tokens_per_second': '1977'} +{'loss': '1.129', 'grad_norm': '2.007', 'learning_rate': '4.995e-05', 'epoch': '0.1979', 'num_input_tokens_seen': 16087373, 'train_runtime': '8137', 'train_tokens_per_second': '1977'} +{'loss': '0.8341', 'grad_norm': '1.662', 'learning_rate': '4.995e-05', 'epoch': '0.1979', 'num_input_tokens_seen': 16089420, 'train_runtime': '8138', 'train_tokens_per_second': '1977'} +{'loss': '1.494', 'grad_norm': '2.255', 'learning_rate': '4.995e-05', 'epoch': '0.1979', 'num_input_tokens_seen': 16091467, 'train_runtime': '8139', 'train_tokens_per_second': '1977'} +{'loss': '0.7', 'grad_norm': '1.352', 'learning_rate': '4.995e-05', 'epoch': '0.198', 'num_input_tokens_seen': 16093514, 'train_runtime': '8140', 'train_tokens_per_second': '1977'} +{'loss': '1.15', 'grad_norm': '1.612', 'learning_rate': '4.995e-05', 'epoch': '0.198', 'num_input_tokens_seen': 16095561, 'train_runtime': '8141', 'train_tokens_per_second': '1977'} +{'loss': '0.5801', 'grad_norm': '1.142', 'learning_rate': '4.995e-05', 'epoch': '0.198', 'num_input_tokens_seen': 16097608, 'train_runtime': '8142', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '1.65', 'learning_rate': '4.995e-05', 'epoch': '0.198', 'num_input_tokens_seen': 16099655, 'train_runtime': '8143', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.731', 'learning_rate': '4.995e-05', 'epoch': '0.1981', 'num_input_tokens_seen': 16101702, 'train_runtime': '8144', 'train_tokens_per_second': '1977'} +{'loss': '0.7829', 'grad_norm': '1.098', 'learning_rate': '4.995e-05', 'epoch': '0.1981', 'num_input_tokens_seen': 16103749, 'train_runtime': '8146', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '1.834', 'learning_rate': '4.995e-05', 'epoch': '0.1981', 'num_input_tokens_seen': 16105796, 'train_runtime': '8147', 'train_tokens_per_second': '1977'} +{'loss': '0.8048', 'grad_norm': '1.415', 'learning_rate': '4.995e-05', 'epoch': '0.1981', 'num_input_tokens_seen': 16107843, 'train_runtime': '8148', 'train_tokens_per_second': '1977'} +{'loss': '0.4126', 'grad_norm': '1.028', 'learning_rate': '4.995e-05', 'epoch': '0.1982', 'num_input_tokens_seen': 16109890, 'train_runtime': '8149', 'train_tokens_per_second': '1977'} +{'loss': '0.4052', 'grad_norm': '1.031', 'learning_rate': '4.995e-05', 'epoch': '0.1982', 'num_input_tokens_seen': 16111937, 'train_runtime': '8150', 'train_tokens_per_second': '1977'} +{'loss': '1.495', 'grad_norm': '1.986', 'learning_rate': '4.995e-05', 'epoch': '0.1982', 'num_input_tokens_seen': 16113984, 'train_runtime': '8151', 'train_tokens_per_second': '1977'} +{'loss': '0.8233', 'grad_norm': '1.408', 'learning_rate': '4.995e-05', 'epoch': '0.1982', 'num_input_tokens_seen': 16116031, 'train_runtime': '8152', 'train_tokens_per_second': '1977'} +{'loss': '0.8291', 'grad_norm': '1.349', 'learning_rate': '4.995e-05', 'epoch': '0.1983', 'num_input_tokens_seen': 16118078, 'train_runtime': '8153', 'train_tokens_per_second': '1977'} +{'loss': '0.5717', 'grad_norm': '1.176', 'learning_rate': '4.995e-05', 'epoch': '0.1983', 'num_input_tokens_seen': 16120125, 'train_runtime': '8154', 'train_tokens_per_second': '1977'} +{'loss': '0.4249', 'grad_norm': '0.8829', 'learning_rate': '4.995e-05', 'epoch': '0.1983', 'num_input_tokens_seen': 16122172, 'train_runtime': '8155', 'train_tokens_per_second': '1977'} +{'loss': '0.9173', 'grad_norm': '1.528', 'learning_rate': '4.995e-05', 'epoch': '0.1983', 'num_input_tokens_seen': 16124219, 'train_runtime': '8156', 'train_tokens_per_second': '1977'} +{'loss': '2.006', 'grad_norm': '2.533', 'learning_rate': '4.995e-05', 'epoch': '0.1984', 'num_input_tokens_seen': 16126266, 'train_runtime': '8157', 'train_tokens_per_second': '1977'} +{'loss': '0.4687', 'grad_norm': '1.174', 'learning_rate': '4.995e-05', 'epoch': '0.1984', 'num_input_tokens_seen': 16128313, 'train_runtime': '8158', 'train_tokens_per_second': '1977'} +{'loss': '1.745', 'grad_norm': '2.165', 'learning_rate': '4.995e-05', 'epoch': '0.1984', 'num_input_tokens_seen': 16130360, 'train_runtime': '8159', 'train_tokens_per_second': '1977'} +{'loss': '0.6468', 'grad_norm': '1.465', 'learning_rate': '4.995e-05', 'epoch': '0.1984', 'num_input_tokens_seen': 16132407, 'train_runtime': '8160', 'train_tokens_per_second': '1977'} +{'loss': '0.8879', 'grad_norm': '1.664', 'learning_rate': '4.995e-05', 'epoch': '0.1985', 'num_input_tokens_seen': 16134454, 'train_runtime': '8161', 'train_tokens_per_second': '1977'} +{'loss': '0.636', 'grad_norm': '1.383', 'learning_rate': '4.995e-05', 'epoch': '0.1985', 'num_input_tokens_seen': 16136501, 'train_runtime': '8162', 'train_tokens_per_second': '1977'} +{'loss': '0.5089', 'grad_norm': '1.334', 'learning_rate': '4.995e-05', 'epoch': '0.1985', 'num_input_tokens_seen': 16138548, 'train_runtime': '8163', 'train_tokens_per_second': '1977'} +{'loss': '0.3818', 'grad_norm': '0.7781', 'learning_rate': '4.995e-05', 'epoch': '0.1985', 'num_input_tokens_seen': 16140595, 'train_runtime': '8164', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '1.473', 'learning_rate': '4.995e-05', 'epoch': '0.1986', 'num_input_tokens_seen': 16142642, 'train_runtime': '8165', 'train_tokens_per_second': '1977'} +{'loss': '1.231', 'grad_norm': '1.755', 'learning_rate': '4.995e-05', 'epoch': '0.1986', 'num_input_tokens_seen': 16144689, 'train_runtime': '8166', 'train_tokens_per_second': '1977'} +{'loss': '0.3792', 'grad_norm': '0.9676', 'learning_rate': '4.995e-05', 'epoch': '0.1986', 'num_input_tokens_seen': 16146736, 'train_runtime': '8167', 'train_tokens_per_second': '1977'} +{'loss': '0.4574', 'grad_norm': '1.128', 'learning_rate': '4.995e-05', 'epoch': '0.1986', 'num_input_tokens_seen': 16148783, 'train_runtime': '8168', 'train_tokens_per_second': '1977'} +{'loss': '0.7299', 'grad_norm': '1.53', 'learning_rate': '4.995e-05', 'epoch': '0.1987', 'num_input_tokens_seen': 16150830, 'train_runtime': '8169', 'train_tokens_per_second': '1977'} +{'loss': '0.3018', 'grad_norm': '0.9317', 'learning_rate': '4.995e-05', 'epoch': '0.1987', 'num_input_tokens_seen': 16152877, 'train_runtime': '8170', 'train_tokens_per_second': '1977'} +{'loss': '0.8074', 'grad_norm': '1.208', 'learning_rate': '4.995e-05', 'epoch': '0.1987', 'num_input_tokens_seen': 16154924, 'train_runtime': '8171', 'train_tokens_per_second': '1977'} +{'loss': '0.7475', 'grad_norm': '1.105', 'learning_rate': '4.995e-05', 'epoch': '0.1987', 'num_input_tokens_seen': 16156971, 'train_runtime': '8172', 'train_tokens_per_second': '1977'} +{'loss': '0.3803', 'grad_norm': '1.045', 'learning_rate': '4.995e-05', 'epoch': '0.1988', 'num_input_tokens_seen': 16159018, 'train_runtime': '8173', 'train_tokens_per_second': '1977'} +{'loss': '1.203', 'grad_norm': '2.041', 'learning_rate': '4.995e-05', 'epoch': '0.1988', 'num_input_tokens_seen': 16161065, 'train_runtime': '8175', 'train_tokens_per_second': '1977'} +{'loss': '0.4332', 'grad_norm': '1.019', 'learning_rate': '4.995e-05', 'epoch': '0.1988', 'num_input_tokens_seen': 16163112, 'train_runtime': '8176', 'train_tokens_per_second': '1977'} +{'loss': '0.4047', 'grad_norm': '0.914', 'learning_rate': '4.995e-05', 'epoch': '0.1988', 'num_input_tokens_seen': 16165159, 'train_runtime': '8177', 'train_tokens_per_second': '1977'} +{'loss': '0.986', 'grad_norm': '1.762', 'learning_rate': '4.995e-05', 'epoch': '0.1989', 'num_input_tokens_seen': 16167206, 'train_runtime': '8178', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '1.983', 'learning_rate': '4.995e-05', 'epoch': '0.1989', 'num_input_tokens_seen': 16169253, 'train_runtime': '8179', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '1.71', 'learning_rate': '4.995e-05', 'epoch': '0.1989', 'num_input_tokens_seen': 16171300, 'train_runtime': '8180', 'train_tokens_per_second': '1977'} +{'loss': '0.5634', 'grad_norm': '1.422', 'learning_rate': '4.995e-05', 'epoch': '0.1989', 'num_input_tokens_seen': 16173347, 'train_runtime': '8181', 'train_tokens_per_second': '1977'} +{'loss': '0.3306', 'grad_norm': '1.119', 'learning_rate': '4.995e-05', 'epoch': '0.199', 'num_input_tokens_seen': 16175394, 'train_runtime': '8182', 'train_tokens_per_second': '1977'} +{'loss': '1.206', 'grad_norm': '1.807', 'learning_rate': '4.995e-05', 'epoch': '0.199', 'num_input_tokens_seen': 16177441, 'train_runtime': '8183', 'train_tokens_per_second': '1977'} +{'loss': '0.8157', 'grad_norm': '1.474', 'learning_rate': '4.995e-05', 'epoch': '0.199', 'num_input_tokens_seen': 16179488, 'train_runtime': '8184', 'train_tokens_per_second': '1977'} +{'loss': '0.3244', 'grad_norm': '0.8977', 'learning_rate': '4.995e-05', 'epoch': '0.199', 'num_input_tokens_seen': 16181535, 'train_runtime': '8185', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.435', 'learning_rate': '4.995e-05', 'epoch': '0.1991', 'num_input_tokens_seen': 16183582, 'train_runtime': '8186', 'train_tokens_per_second': '1977'} +{'loss': '0.7206', 'grad_norm': '1.446', 'learning_rate': '4.995e-05', 'epoch': '0.1991', 'num_input_tokens_seen': 16185629, 'train_runtime': '8187', 'train_tokens_per_second': '1977'} +{'loss': '0.4607', 'grad_norm': '0.8222', 'learning_rate': '4.995e-05', 'epoch': '0.1991', 'num_input_tokens_seen': 16187676, 'train_runtime': '8188', 'train_tokens_per_second': '1977'} +{'loss': '1.353', 'grad_norm': '1.963', 'learning_rate': '4.995e-05', 'epoch': '0.1991', 'num_input_tokens_seen': 16189723, 'train_runtime': '8189', 'train_tokens_per_second': '1977'} +{'loss': '0.5237', 'grad_norm': '1.083', 'learning_rate': '4.995e-05', 'epoch': '0.1992', 'num_input_tokens_seen': 16191770, 'train_runtime': '8190', 'train_tokens_per_second': '1977'} +{'loss': '0.7715', 'grad_norm': '1.295', 'learning_rate': '4.995e-05', 'epoch': '0.1992', 'num_input_tokens_seen': 16193817, 'train_runtime': '8191', 'train_tokens_per_second': '1977'} +{'loss': '0.4259', 'grad_norm': '1.077', 'learning_rate': '4.995e-05', 'epoch': '0.1992', 'num_input_tokens_seen': 16195864, 'train_runtime': '8192', 'train_tokens_per_second': '1977'} +{'loss': '0.56', 'grad_norm': '1.176', 'learning_rate': '4.995e-05', 'epoch': '0.1992', 'num_input_tokens_seen': 16197911, 'train_runtime': '8193', 'train_tokens_per_second': '1977'} +{'loss': '1.126', 'grad_norm': '1.738', 'learning_rate': '4.995e-05', 'epoch': '0.1993', 'num_input_tokens_seen': 16199958, 'train_runtime': '8194', 'train_tokens_per_second': '1977'} +{'loss': '0.481', 'grad_norm': '0.8808', 'learning_rate': '4.995e-05', 'epoch': '0.1993', 'num_input_tokens_seen': 16202005, 'train_runtime': '8195', 'train_tokens_per_second': '1977'} +{'loss': '0.5475', 'grad_norm': '1.045', 'learning_rate': '4.995e-05', 'epoch': '0.1993', 'num_input_tokens_seen': 16204052, 'train_runtime': '8196', 'train_tokens_per_second': '1977'} +{'loss': '0.3851', 'grad_norm': '0.9443', 'learning_rate': '4.995e-05', 'epoch': '0.1993', 'num_input_tokens_seen': 16206099, 'train_runtime': '8197', 'train_tokens_per_second': '1977'} +{'loss': '0.5195', 'grad_norm': '1.047', 'learning_rate': '4.995e-05', 'epoch': '0.1994', 'num_input_tokens_seen': 16208146, 'train_runtime': '8198', 'train_tokens_per_second': '1977'} +{'loss': '0.3999', 'grad_norm': '0.9332', 'learning_rate': '4.995e-05', 'epoch': '0.1994', 'num_input_tokens_seen': 16210193, 'train_runtime': '8199', 'train_tokens_per_second': '1977'} +{'loss': '0.3851', 'grad_norm': '1.028', 'learning_rate': '4.995e-05', 'epoch': '0.1994', 'num_input_tokens_seen': 16212240, 'train_runtime': '8200', 'train_tokens_per_second': '1977'} +{'loss': '0.8326', 'grad_norm': '1.439', 'learning_rate': '4.995e-05', 'epoch': '0.1994', 'num_input_tokens_seen': 16214287, 'train_runtime': '8201', 'train_tokens_per_second': '1977'} +{'loss': '2.043', 'grad_norm': '2.447', 'learning_rate': '4.995e-05', 'epoch': '0.1995', 'num_input_tokens_seen': 16216334, 'train_runtime': '8202', 'train_tokens_per_second': '1977'} +{'loss': '0.6238', 'grad_norm': '1.481', 'learning_rate': '4.995e-05', 'epoch': '0.1995', 'num_input_tokens_seen': 16218381, 'train_runtime': '8203', 'train_tokens_per_second': '1977'} +{'loss': '0.6637', 'grad_norm': '1.309', 'learning_rate': '4.995e-05', 'epoch': '0.1995', 'num_input_tokens_seen': 16220428, 'train_runtime': '8205', 'train_tokens_per_second': '1977'} +{'loss': '0.2648', 'grad_norm': '0.8636', 'learning_rate': '4.995e-05', 'epoch': '0.1995', 'num_input_tokens_seen': 16222475, 'train_runtime': '8206', 'train_tokens_per_second': '1977'} +{'loss': '0.8074', 'grad_norm': '1.22', 'learning_rate': '4.995e-05', 'epoch': '0.1996', 'num_input_tokens_seen': 16224522, 'train_runtime': '8207', 'train_tokens_per_second': '1977'} +{'loss': '0.5416', 'grad_norm': '1.353', 'learning_rate': '4.995e-05', 'epoch': '0.1996', 'num_input_tokens_seen': 16226569, 'train_runtime': '8208', 'train_tokens_per_second': '1977'} +{'loss': '0.3345', 'grad_norm': '0.9055', 'learning_rate': '4.995e-05', 'epoch': '0.1996', 'num_input_tokens_seen': 16228616, 'train_runtime': '8209', 'train_tokens_per_second': '1977'} +{'loss': '0.6693', 'grad_norm': '1.34', 'learning_rate': '4.995e-05', 'epoch': '0.1996', 'num_input_tokens_seen': 16230663, 'train_runtime': '8210', 'train_tokens_per_second': '1977'} +{'loss': '1.943', 'grad_norm': '2.777', 'learning_rate': '4.995e-05', 'epoch': '0.1997', 'num_input_tokens_seen': 16232710, 'train_runtime': '8211', 'train_tokens_per_second': '1977'} +{'loss': '0.8025', 'grad_norm': '1.245', 'learning_rate': '4.995e-05', 'epoch': '0.1997', 'num_input_tokens_seen': 16234757, 'train_runtime': '8212', 'train_tokens_per_second': '1977'} +{'loss': '2.157', 'grad_norm': '2.069', 'learning_rate': '4.995e-05', 'epoch': '0.1997', 'num_input_tokens_seen': 16236804, 'train_runtime': '8213', 'train_tokens_per_second': '1977'} +{'loss': '0.4099', 'grad_norm': '0.9303', 'learning_rate': '4.995e-05', 'epoch': '0.1997', 'num_input_tokens_seen': 16238851, 'train_runtime': '8214', 'train_tokens_per_second': '1977'} +{'loss': '0.6105', 'grad_norm': '1.147', 'learning_rate': '4.995e-05', 'epoch': '0.1998', 'num_input_tokens_seen': 16240898, 'train_runtime': '8215', 'train_tokens_per_second': '1977'} +{'loss': '0.5967', 'grad_norm': '1.232', 'learning_rate': '4.995e-05', 'epoch': '0.1998', 'num_input_tokens_seen': 16242945, 'train_runtime': '8216', 'train_tokens_per_second': '1977'} +{'loss': '0.4635', 'grad_norm': '1.042', 'learning_rate': '4.995e-05', 'epoch': '0.1998', 'num_input_tokens_seen': 16244992, 'train_runtime': '8217', 'train_tokens_per_second': '1977'} +{'loss': '0.63', 'grad_norm': '1.492', 'learning_rate': '4.995e-05', 'epoch': '0.1998', 'num_input_tokens_seen': 16247039, 'train_runtime': '8218', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '1.769', 'learning_rate': '4.995e-05', 'epoch': '0.1999', 'num_input_tokens_seen': 16249086, 'train_runtime': '8219', 'train_tokens_per_second': '1977'} +{'loss': '0.7705', 'grad_norm': '1.276', 'learning_rate': '4.995e-05', 'epoch': '0.1999', 'num_input_tokens_seen': 16251133, 'train_runtime': '8220', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '1.588', 'learning_rate': '4.995e-05', 'epoch': '0.1999', 'num_input_tokens_seen': 16253180, 'train_runtime': '8221', 'train_tokens_per_second': '1977'} +{'loss': '0.7482', 'grad_norm': '1.407', 'learning_rate': '4.995e-05', 'epoch': '0.1999', 'num_input_tokens_seen': 16255227, 'train_runtime': '8222', 'train_tokens_per_second': '1977'} +{'loss': '0.6895', 'grad_norm': '1.23', 'learning_rate': '4.995e-05', 'epoch': '0.2', 'num_input_tokens_seen': 16257274, 'train_runtime': '8223', 'train_tokens_per_second': '1977'} +{'loss': '0.878', 'grad_norm': '1.486', 'learning_rate': '4.995e-05', 'epoch': '0.2', 'num_input_tokens_seen': 16259321, 'train_runtime': '8224', 'train_tokens_per_second': '1977'} +{'loss': '0.5236', 'grad_norm': '1.449', 'learning_rate': '4.995e-05', 'epoch': '0.2', 'num_input_tokens_seen': 16261368, 'train_runtime': '8225', 'train_tokens_per_second': '1977'} +{'loss': '0.9598', 'grad_norm': '1.19', 'learning_rate': '4.995e-05', 'epoch': '0.2', 'num_input_tokens_seen': 16263415, 'train_runtime': '8226', 'train_tokens_per_second': '1977'} +{'loss': '0.652', 'grad_norm': '1.045', 'learning_rate': '4.995e-05', 'epoch': '0.2001', 'num_input_tokens_seen': 16265462, 'train_runtime': '8227', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.225', 'learning_rate': '4.995e-05', 'epoch': '0.2001', 'num_input_tokens_seen': 16267509, 'train_runtime': '8228', 'train_tokens_per_second': '1977'} +{'loss': '0.9344', 'grad_norm': '1.492', 'learning_rate': '4.995e-05', 'epoch': '0.2001', 'num_input_tokens_seen': 16269556, 'train_runtime': '8229', 'train_tokens_per_second': '1977'} +{'loss': '0.8774', 'grad_norm': '1.403', 'learning_rate': '4.995e-05', 'epoch': '0.2001', 'num_input_tokens_seen': 16271603, 'train_runtime': '8230', 'train_tokens_per_second': '1977'} +{'loss': '1.362', 'grad_norm': '2.106', 'learning_rate': '4.995e-05', 'epoch': '0.2002', 'num_input_tokens_seen': 16273650, 'train_runtime': '8231', 'train_tokens_per_second': '1977'} +{'loss': '1.281', 'grad_norm': '1.618', 'learning_rate': '4.995e-05', 'epoch': '0.2002', 'num_input_tokens_seen': 16275697, 'train_runtime': '8233', 'train_tokens_per_second': '1977'} +{'loss': '0.9441', 'grad_norm': '1.713', 'learning_rate': '4.995e-05', 'epoch': '0.2002', 'num_input_tokens_seen': 16277744, 'train_runtime': '8234', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '1.988', 'learning_rate': '4.995e-05', 'epoch': '0.2002', 'num_input_tokens_seen': 16279791, 'train_runtime': '8235', 'train_tokens_per_second': '1977'} +{'loss': '2.24', 'grad_norm': '2.436', 'learning_rate': '4.995e-05', 'epoch': '0.2003', 'num_input_tokens_seen': 16281838, 'train_runtime': '8236', 'train_tokens_per_second': '1977'} +{'loss': '0.3562', 'grad_norm': '0.9487', 'learning_rate': '4.995e-05', 'epoch': '0.2003', 'num_input_tokens_seen': 16283885, 'train_runtime': '8237', 'train_tokens_per_second': '1977'} +{'loss': '1.068', 'grad_norm': '1.948', 'learning_rate': '4.995e-05', 'epoch': '0.2003', 'num_input_tokens_seen': 16285932, 'train_runtime': '8238', 'train_tokens_per_second': '1977'} +{'loss': '1.012', 'grad_norm': '1.979', 'learning_rate': '4.995e-05', 'epoch': '0.2003', 'num_input_tokens_seen': 16287979, 'train_runtime': '8239', 'train_tokens_per_second': '1977'} +{'loss': '0.4811', 'grad_norm': '1.011', 'learning_rate': '4.995e-05', 'epoch': '0.2004', 'num_input_tokens_seen': 16290026, 'train_runtime': '8240', 'train_tokens_per_second': '1977'} +{'loss': '0.6283', 'grad_norm': '0.9987', 'learning_rate': '4.995e-05', 'epoch': '0.2004', 'num_input_tokens_seen': 16292073, 'train_runtime': '8241', 'train_tokens_per_second': '1977'} +{'loss': '0.244', 'grad_norm': '0.8403', 'learning_rate': '4.995e-05', 'epoch': '0.2004', 'num_input_tokens_seen': 16294120, 'train_runtime': '8242', 'train_tokens_per_second': '1977'} +{'loss': '0.4241', 'grad_norm': '0.9753', 'learning_rate': '4.995e-05', 'epoch': '0.2004', 'num_input_tokens_seen': 16296167, 'train_runtime': '8243', 'train_tokens_per_second': '1977'} +{'loss': '0.7081', 'grad_norm': '1.417', 'learning_rate': '4.995e-05', 'epoch': '0.2005', 'num_input_tokens_seen': 16298214, 'train_runtime': '8244', 'train_tokens_per_second': '1977'} +{'loss': '1.152', 'grad_norm': '1.911', 'learning_rate': '4.995e-05', 'epoch': '0.2005', 'num_input_tokens_seen': 16300261, 'train_runtime': '8245', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '1.629', 'learning_rate': '4.995e-05', 'epoch': '0.2005', 'num_input_tokens_seen': 16302308, 'train_runtime': '8246', 'train_tokens_per_second': '1977'} +{'loss': '0.2871', 'grad_norm': '0.838', 'learning_rate': '4.995e-05', 'epoch': '0.2005', 'num_input_tokens_seen': 16304355, 'train_runtime': '8247', 'train_tokens_per_second': '1977'} +{'loss': '0.7724', 'grad_norm': '1.467', 'learning_rate': '4.995e-05', 'epoch': '0.2006', 'num_input_tokens_seen': 16306402, 'train_runtime': '8248', 'train_tokens_per_second': '1977'} +{'loss': '0.8769', 'grad_norm': '1.262', 'learning_rate': '4.995e-05', 'epoch': '0.2006', 'num_input_tokens_seen': 16308449, 'train_runtime': '8249', 'train_tokens_per_second': '1977'} +{'loss': '0.9474', 'grad_norm': '1.777', 'learning_rate': '4.995e-05', 'epoch': '0.2006', 'num_input_tokens_seen': 16310496, 'train_runtime': '8250', 'train_tokens_per_second': '1977'} +{'loss': '0.3591', 'grad_norm': '0.9905', 'learning_rate': '4.995e-05', 'epoch': '0.2006', 'num_input_tokens_seen': 16312543, 'train_runtime': '8251', 'train_tokens_per_second': '1977'} +{'loss': '0.8256', 'grad_norm': '1.287', 'learning_rate': '4.995e-05', 'epoch': '0.2007', 'num_input_tokens_seen': 16314590, 'train_runtime': '8252', 'train_tokens_per_second': '1977'} +{'loss': '0.4857', 'grad_norm': '1.09', 'learning_rate': '4.995e-05', 'epoch': '0.2007', 'num_input_tokens_seen': 16316637, 'train_runtime': '8253', 'train_tokens_per_second': '1977'} +{'loss': '0.393', 'grad_norm': '1.01', 'learning_rate': '4.995e-05', 'epoch': '0.2007', 'num_input_tokens_seen': 16318684, 'train_runtime': '8254', 'train_tokens_per_second': '1977'} +{'loss': '1.042', 'grad_norm': '1.514', 'learning_rate': '4.995e-05', 'epoch': '0.2007', 'num_input_tokens_seen': 16320731, 'train_runtime': '8255', 'train_tokens_per_second': '1977'} +{'loss': '1.36', 'grad_norm': '2.421', 'learning_rate': '4.995e-05', 'epoch': '0.2008', 'num_input_tokens_seen': 16322778, 'train_runtime': '8256', 'train_tokens_per_second': '1977'} +{'loss': '0.7136', 'grad_norm': '1.573', 'learning_rate': '4.995e-05', 'epoch': '0.2008', 'num_input_tokens_seen': 16324825, 'train_runtime': '8257', 'train_tokens_per_second': '1977'} +{'loss': '0.4912', 'grad_norm': '1.117', 'learning_rate': '4.995e-05', 'epoch': '0.2008', 'num_input_tokens_seen': 16326872, 'train_runtime': '8258', 'train_tokens_per_second': '1977'} +{'loss': '0.7773', 'grad_norm': '1.202', 'learning_rate': '4.995e-05', 'epoch': '0.2008', 'num_input_tokens_seen': 16328919, 'train_runtime': '8259', 'train_tokens_per_second': '1977'} +{'loss': '0.6293', 'grad_norm': '1.167', 'learning_rate': '4.995e-05', 'epoch': '0.2009', 'num_input_tokens_seen': 16330966, 'train_runtime': '8260', 'train_tokens_per_second': '1977'} +{'loss': '0.8468', 'grad_norm': '1.541', 'learning_rate': '4.995e-05', 'epoch': '0.2009', 'num_input_tokens_seen': 16333013, 'train_runtime': '8262', 'train_tokens_per_second': '1977'} +{'loss': '0.3306', 'grad_norm': '0.8005', 'learning_rate': '4.995e-05', 'epoch': '0.2009', 'num_input_tokens_seen': 16335060, 'train_runtime': '8263', 'train_tokens_per_second': '1977'} +{'loss': '0.4095', 'grad_norm': '1.117', 'learning_rate': '4.995e-05', 'epoch': '0.2009', 'num_input_tokens_seen': 16337107, 'train_runtime': '8264', 'train_tokens_per_second': '1977'} +{'loss': '0.6471', 'grad_norm': '1.327', 'learning_rate': '4.995e-05', 'epoch': '0.201', 'num_input_tokens_seen': 16339154, 'train_runtime': '8265', 'train_tokens_per_second': '1977'} +{'loss': '0.9528', 'grad_norm': '1.189', 'learning_rate': '4.995e-05', 'epoch': '0.201', 'num_input_tokens_seen': 16341201, 'train_runtime': '8266', 'train_tokens_per_second': '1977'} +{'loss': '0.494', 'grad_norm': '1.151', 'learning_rate': '4.995e-05', 'epoch': '0.201', 'num_input_tokens_seen': 16343248, 'train_runtime': '8267', 'train_tokens_per_second': '1977'} +{'loss': '0.5298', 'grad_norm': '1.291', 'learning_rate': '4.995e-05', 'epoch': '0.201', 'num_input_tokens_seen': 16345295, 'train_runtime': '8268', 'train_tokens_per_second': '1977'} +{'loss': '1.522', 'grad_norm': '2.239', 'learning_rate': '4.995e-05', 'epoch': '0.2011', 'num_input_tokens_seen': 16347342, 'train_runtime': '8269', 'train_tokens_per_second': '1977'} +{'loss': '0.3886', 'grad_norm': '0.8586', 'learning_rate': '4.995e-05', 'epoch': '0.2011', 'num_input_tokens_seen': 16349389, 'train_runtime': '8270', 'train_tokens_per_second': '1977'} +{'loss': '1.309', 'grad_norm': '1.839', 'learning_rate': '4.995e-05', 'epoch': '0.2011', 'num_input_tokens_seen': 16351436, 'train_runtime': '8271', 'train_tokens_per_second': '1977'} +{'loss': '0.5191', 'grad_norm': '1.014', 'learning_rate': '4.995e-05', 'epoch': '0.2011', 'num_input_tokens_seen': 16353483, 'train_runtime': '8272', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.123', 'learning_rate': '4.995e-05', 'epoch': '0.2012', 'num_input_tokens_seen': 16355530, 'train_runtime': '8273', 'train_tokens_per_second': '1977'} +{'loss': '0.7021', 'grad_norm': '1.392', 'learning_rate': '4.995e-05', 'epoch': '0.2012', 'num_input_tokens_seen': 16357577, 'train_runtime': '8274', 'train_tokens_per_second': '1977'} +{'loss': '0.3571', 'grad_norm': '1.044', 'learning_rate': '4.995e-05', 'epoch': '0.2012', 'num_input_tokens_seen': 16359624, 'train_runtime': '8275', 'train_tokens_per_second': '1977'} +{'loss': '0.398', 'grad_norm': '0.8924', 'learning_rate': '4.995e-05', 'epoch': '0.2012', 'num_input_tokens_seen': 16361671, 'train_runtime': '8276', 'train_tokens_per_second': '1977'} +{'loss': '0.5082', 'grad_norm': '1.194', 'learning_rate': '4.995e-05', 'epoch': '0.2013', 'num_input_tokens_seen': 16363718, 'train_runtime': '8277', 'train_tokens_per_second': '1977'} +{'loss': '0.8079', 'grad_norm': '1.282', 'learning_rate': '4.995e-05', 'epoch': '0.2013', 'num_input_tokens_seen': 16365765, 'train_runtime': '8278', 'train_tokens_per_second': '1977'} +{'loss': '0.3549', 'grad_norm': '1.114', 'learning_rate': '4.995e-05', 'epoch': '0.2013', 'num_input_tokens_seen': 16367812, 'train_runtime': '8279', 'train_tokens_per_second': '1977'} +{'loss': '1.753', 'grad_norm': '2.486', 'learning_rate': '4.995e-05', 'epoch': '0.2013', 'num_input_tokens_seen': 16369859, 'train_runtime': '8280', 'train_tokens_per_second': '1977'} +{'loss': '0.7464', 'grad_norm': '1.392', 'learning_rate': '4.995e-05', 'epoch': '0.2014', 'num_input_tokens_seen': 16371906, 'train_runtime': '8281', 'train_tokens_per_second': '1977'} +{'loss': '0.5711', 'grad_norm': '1.256', 'learning_rate': '4.995e-05', 'epoch': '0.2014', 'num_input_tokens_seen': 16373953, 'train_runtime': '8282', 'train_tokens_per_second': '1977'} +{'loss': '0.3027', 'grad_norm': '0.9501', 'learning_rate': '4.995e-05', 'epoch': '0.2014', 'num_input_tokens_seen': 16376000, 'train_runtime': '8283', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 04:55:28,792 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 04:55:28,792 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 04:55:29,296 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-8000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 04:55:29,311 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-8000/tokenizer_config.json + +{'loss': '0.2603', 'grad_norm': '0.8616', 'learning_rate': '4.995e-05', 'epoch': '0.2015', 'num_input_tokens_seen': 16378047, 'train_runtime': '8285', 'train_tokens_per_second': '1977'} +{'loss': '0.3884', 'grad_norm': '0.9293', 'learning_rate': '4.995e-05', 'epoch': '0.2015', 'num_input_tokens_seen': 16380094, 'train_runtime': '8286', 'train_tokens_per_second': '1977'} +{'loss': '0.6249', 'grad_norm': '1.406', 'learning_rate': '4.995e-05', 'epoch': '0.2015', 'num_input_tokens_seen': 16382141, 'train_runtime': '8287', 'train_tokens_per_second': '1977'} +{'loss': '0.3776', 'grad_norm': '1.039', 'learning_rate': '4.995e-05', 'epoch': '0.2015', 'num_input_tokens_seen': 16384188, 'train_runtime': '8288', 'train_tokens_per_second': '1977'} +{'loss': '0.6575', 'grad_norm': '1.159', 'learning_rate': '4.995e-05', 'epoch': '0.2016', 'num_input_tokens_seen': 16386235, 'train_runtime': '8289', 'train_tokens_per_second': '1977'} +{'loss': '0.5126', 'grad_norm': '1.337', 'learning_rate': '4.995e-05', 'epoch': '0.2016', 'num_input_tokens_seen': 16388282, 'train_runtime': '8290', 'train_tokens_per_second': '1977'} +{'loss': '0.8816', 'grad_norm': '1.713', 'learning_rate': '4.995e-05', 'epoch': '0.2016', 'num_input_tokens_seen': 16390329, 'train_runtime': '8291', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '1.727', 'learning_rate': '4.995e-05', 'epoch': '0.2016', 'num_input_tokens_seen': 16392376, 'train_runtime': '8292', 'train_tokens_per_second': '1977'} +{'loss': '0.4505', 'grad_norm': '1.256', 'learning_rate': '4.995e-05', 'epoch': '0.2017', 'num_input_tokens_seen': 16394423, 'train_runtime': '8293', 'train_tokens_per_second': '1977'} +{'loss': '0.5852', 'grad_norm': '1.336', 'learning_rate': '4.995e-05', 'epoch': '0.2017', 'num_input_tokens_seen': 16396470, 'train_runtime': '8294', 'train_tokens_per_second': '1977'} +{'loss': '1.324', 'grad_norm': '2.129', 'learning_rate': '4.995e-05', 'epoch': '0.2017', 'num_input_tokens_seen': 16398517, 'train_runtime': '8295', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '1.555', 'learning_rate': '4.995e-05', 'epoch': '0.2017', 'num_input_tokens_seen': 16400564, 'train_runtime': '8296', 'train_tokens_per_second': '1977'} +{'loss': '0.5447', 'grad_norm': '1.293', 'learning_rate': '4.995e-05', 'epoch': '0.2018', 'num_input_tokens_seen': 16402611, 'train_runtime': '8298', 'train_tokens_per_second': '1977'} +{'loss': '0.4067', 'grad_norm': '0.9203', 'learning_rate': '4.995e-05', 'epoch': '0.2018', 'num_input_tokens_seen': 16404658, 'train_runtime': '8299', 'train_tokens_per_second': '1977'} +{'loss': '0.7002', 'grad_norm': '0.9687', 'learning_rate': '4.995e-05', 'epoch': '0.2018', 'num_input_tokens_seen': 16406705, 'train_runtime': '8300', 'train_tokens_per_second': '1977'} +{'loss': '0.7354', 'grad_norm': '1.396', 'learning_rate': '4.995e-05', 'epoch': '0.2018', 'num_input_tokens_seen': 16408752, 'train_runtime': '8301', 'train_tokens_per_second': '1977'} +{'loss': '1.045', 'grad_norm': '1.86', 'learning_rate': '4.995e-05', 'epoch': '0.2019', 'num_input_tokens_seen': 16410799, 'train_runtime': '8302', 'train_tokens_per_second': '1977'} +{'loss': '0.7064', 'grad_norm': '1.282', 'learning_rate': '4.995e-05', 'epoch': '0.2019', 'num_input_tokens_seen': 16412846, 'train_runtime': '8303', 'train_tokens_per_second': '1977'} +{'loss': '0.1762', 'grad_norm': '0.7187', 'learning_rate': '4.995e-05', 'epoch': '0.2019', 'num_input_tokens_seen': 16414893, 'train_runtime': '8304', 'train_tokens_per_second': '1977'} +{'loss': '0.6676', 'grad_norm': '1.448', 'learning_rate': '4.995e-05', 'epoch': '0.2019', 'num_input_tokens_seen': 16416940, 'train_runtime': '8305', 'train_tokens_per_second': '1977'} +{'loss': '1.024', 'grad_norm': '1.966', 'learning_rate': '4.995e-05', 'epoch': '0.202', 'num_input_tokens_seen': 16418987, 'train_runtime': '8306', 'train_tokens_per_second': '1977'} +{'loss': '1.412', 'grad_norm': '2.11', 'learning_rate': '4.995e-05', 'epoch': '0.202', 'num_input_tokens_seen': 16421034, 'train_runtime': '8307', 'train_tokens_per_second': '1977'} +{'loss': '0.3965', 'grad_norm': '0.946', 'learning_rate': '4.995e-05', 'epoch': '0.202', 'num_input_tokens_seen': 16423081, 'train_runtime': '8308', 'train_tokens_per_second': '1977'} +{'loss': '0.6483', 'grad_norm': '1.447', 'learning_rate': '4.995e-05', 'epoch': '0.202', 'num_input_tokens_seen': 16425128, 'train_runtime': '8309', 'train_tokens_per_second': '1977'} +{'loss': '0.716', 'grad_norm': '1.356', 'learning_rate': '4.995e-05', 'epoch': '0.2021', 'num_input_tokens_seen': 16427175, 'train_runtime': '8310', 'train_tokens_per_second': '1977'} +{'loss': '2.246', 'grad_norm': '2.497', 'learning_rate': '4.995e-05', 'epoch': '0.2021', 'num_input_tokens_seen': 16429222, 'train_runtime': '8311', 'train_tokens_per_second': '1977'} +{'loss': '0.5053', 'grad_norm': '1.137', 'learning_rate': '4.995e-05', 'epoch': '0.2021', 'num_input_tokens_seen': 16431269, 'train_runtime': '8312', 'train_tokens_per_second': '1977'} +{'loss': '1.493', 'grad_norm': '2.005', 'learning_rate': '4.995e-05', 'epoch': '0.2021', 'num_input_tokens_seen': 16433316, 'train_runtime': '8313', 'train_tokens_per_second': '1977'} +{'loss': '0.837', 'grad_norm': '1.25', 'learning_rate': '4.995e-05', 'epoch': '0.2022', 'num_input_tokens_seen': 16435363, 'train_runtime': '8314', 'train_tokens_per_second': '1977'} +{'loss': '0.3056', 'grad_norm': '0.9881', 'learning_rate': '4.995e-05', 'epoch': '0.2022', 'num_input_tokens_seen': 16437410, 'train_runtime': '8315', 'train_tokens_per_second': '1977'} +{'loss': '0.644', 'grad_norm': '1.24', 'learning_rate': '4.995e-05', 'epoch': '0.2022', 'num_input_tokens_seen': 16439457, 'train_runtime': '8316', 'train_tokens_per_second': '1977'} +{'loss': '0.6097', 'grad_norm': '1.212', 'learning_rate': '4.995e-05', 'epoch': '0.2022', 'num_input_tokens_seen': 16441504, 'train_runtime': '8317', 'train_tokens_per_second': '1977'} +{'loss': '0.3088', 'grad_norm': '0.8557', 'learning_rate': '4.995e-05', 'epoch': '0.2023', 'num_input_tokens_seen': 16443551, 'train_runtime': '8318', 'train_tokens_per_second': '1977'} +{'loss': '0.4449', 'grad_norm': '1.019', 'learning_rate': '4.995e-05', 'epoch': '0.2023', 'num_input_tokens_seen': 16445598, 'train_runtime': '8319', 'train_tokens_per_second': '1977'} +{'loss': '1.066', 'grad_norm': '1.727', 'learning_rate': '4.995e-05', 'epoch': '0.2023', 'num_input_tokens_seen': 16447645, 'train_runtime': '8320', 'train_tokens_per_second': '1977'} +{'loss': '1.074', 'grad_norm': '1.659', 'learning_rate': '4.995e-05', 'epoch': '0.2023', 'num_input_tokens_seen': 16449692, 'train_runtime': '8321', 'train_tokens_per_second': '1977'} +{'loss': '0.2484', 'grad_norm': '0.9938', 'learning_rate': '4.995e-05', 'epoch': '0.2024', 'num_input_tokens_seen': 16451739, 'train_runtime': '8322', 'train_tokens_per_second': '1977'} +{'loss': '0.6056', 'grad_norm': '1.372', 'learning_rate': '4.995e-05', 'epoch': '0.2024', 'num_input_tokens_seen': 16453786, 'train_runtime': '8323', 'train_tokens_per_second': '1977'} +{'loss': '0.2561', 'grad_norm': '0.8473', 'learning_rate': '4.995e-05', 'epoch': '0.2024', 'num_input_tokens_seen': 16455833, 'train_runtime': '8324', 'train_tokens_per_second': '1977'} +{'loss': '0.6973', 'grad_norm': '1.162', 'learning_rate': '4.995e-05', 'epoch': '0.2024', 'num_input_tokens_seen': 16457880, 'train_runtime': '8325', 'train_tokens_per_second': '1977'} +{'loss': '0.5838', 'grad_norm': '1.148', 'learning_rate': '4.995e-05', 'epoch': '0.2025', 'num_input_tokens_seen': 16459927, 'train_runtime': '8327', 'train_tokens_per_second': '1977'} +{'loss': '0.8476', 'grad_norm': '1.455', 'learning_rate': '4.995e-05', 'epoch': '0.2025', 'num_input_tokens_seen': 16461974, 'train_runtime': '8328', 'train_tokens_per_second': '1977'} +{'loss': '1.901', 'grad_norm': '2.505', 'learning_rate': '4.995e-05', 'epoch': '0.2025', 'num_input_tokens_seen': 16464021, 'train_runtime': '8329', 'train_tokens_per_second': '1977'} +{'loss': '0.3443', 'grad_norm': '0.9624', 'learning_rate': '4.995e-05', 'epoch': '0.2025', 'num_input_tokens_seen': 16466068, 'train_runtime': '8330', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.14', 'learning_rate': '4.995e-05', 'epoch': '0.2026', 'num_input_tokens_seen': 16468115, 'train_runtime': '8331', 'train_tokens_per_second': '1977'} +{'loss': '0.4043', 'grad_norm': '0.9413', 'learning_rate': '4.995e-05', 'epoch': '0.2026', 'num_input_tokens_seen': 16470162, 'train_runtime': '8332', 'train_tokens_per_second': '1977'} +{'loss': '0.7454', 'grad_norm': '1.305', 'learning_rate': '4.995e-05', 'epoch': '0.2026', 'num_input_tokens_seen': 16472209, 'train_runtime': '8333', 'train_tokens_per_second': '1977'} +{'loss': '1.139', 'grad_norm': '1.819', 'learning_rate': '4.995e-05', 'epoch': '0.2026', 'num_input_tokens_seen': 16474256, 'train_runtime': '8334', 'train_tokens_per_second': '1977'} +{'loss': '0.7939', 'grad_norm': '1.682', 'learning_rate': '4.995e-05', 'epoch': '0.2027', 'num_input_tokens_seen': 16476303, 'train_runtime': '8335', 'train_tokens_per_second': '1977'} +{'loss': '0.3384', 'grad_norm': '1.001', 'learning_rate': '4.995e-05', 'epoch': '0.2027', 'num_input_tokens_seen': 16478350, 'train_runtime': '8336', 'train_tokens_per_second': '1977'} +{'loss': '0.5058', 'grad_norm': '1.286', 'learning_rate': '4.995e-05', 'epoch': '0.2027', 'num_input_tokens_seen': 16480397, 'train_runtime': '8337', 'train_tokens_per_second': '1977'} +{'loss': '0.9989', 'grad_norm': '1.716', 'learning_rate': '4.995e-05', 'epoch': '0.2027', 'num_input_tokens_seen': 16482444, 'train_runtime': '8338', 'train_tokens_per_second': '1977'} +{'loss': '1.106', 'grad_norm': '1.209', 'learning_rate': '4.995e-05', 'epoch': '0.2028', 'num_input_tokens_seen': 16484491, 'train_runtime': '8339', 'train_tokens_per_second': '1977'} +{'loss': '0.9766', 'grad_norm': '1.242', 'learning_rate': '4.995e-05', 'epoch': '0.2028', 'num_input_tokens_seen': 16486538, 'train_runtime': '8340', 'train_tokens_per_second': '1977'} +{'loss': '0.3227', 'grad_norm': '0.909', 'learning_rate': '4.995e-05', 'epoch': '0.2028', 'num_input_tokens_seen': 16488585, 'train_runtime': '8341', 'train_tokens_per_second': '1977'} +{'loss': '0.572', 'grad_norm': '1.367', 'learning_rate': '4.995e-05', 'epoch': '0.2028', 'num_input_tokens_seen': 16490632, 'train_runtime': '8342', 'train_tokens_per_second': '1977'} +{'loss': '0.6046', 'grad_norm': '0.9019', 'learning_rate': '4.995e-05', 'epoch': '0.2029', 'num_input_tokens_seen': 16492679, 'train_runtime': '8343', 'train_tokens_per_second': '1977'} +{'loss': '0.2527', 'grad_norm': '0.8998', 'learning_rate': '4.995e-05', 'epoch': '0.2029', 'num_input_tokens_seen': 16494726, 'train_runtime': '8344', 'train_tokens_per_second': '1977'} +{'loss': '0.5908', 'grad_norm': '1.126', 'learning_rate': '4.995e-05', 'epoch': '0.2029', 'num_input_tokens_seen': 16496773, 'train_runtime': '8345', 'train_tokens_per_second': '1977'} +{'loss': '1.062', 'grad_norm': '1.68', 'learning_rate': '4.995e-05', 'epoch': '0.2029', 'num_input_tokens_seen': 16498820, 'train_runtime': '8346', 'train_tokens_per_second': '1977'} +{'loss': '1.181', 'grad_norm': '1.755', 'learning_rate': '4.995e-05', 'epoch': '0.203', 'num_input_tokens_seen': 16500867, 'train_runtime': '8347', 'train_tokens_per_second': '1977'} +{'loss': '0.5103', 'grad_norm': '1.256', 'learning_rate': '4.995e-05', 'epoch': '0.203', 'num_input_tokens_seen': 16502914, 'train_runtime': '8348', 'train_tokens_per_second': '1977'} +{'loss': '0.8022', 'grad_norm': '1.225', 'learning_rate': '4.995e-05', 'epoch': '0.203', 'num_input_tokens_seen': 16504961, 'train_runtime': '8349', 'train_tokens_per_second': '1977'} +{'loss': '0.4425', 'grad_norm': '1.084', 'learning_rate': '4.995e-05', 'epoch': '0.203', 'num_input_tokens_seen': 16507008, 'train_runtime': '8350', 'train_tokens_per_second': '1977'} +{'loss': '0.9938', 'grad_norm': '1.694', 'learning_rate': '4.995e-05', 'epoch': '0.2031', 'num_input_tokens_seen': 16509055, 'train_runtime': '8351', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '1.44', 'learning_rate': '4.995e-05', 'epoch': '0.2031', 'num_input_tokens_seen': 16511102, 'train_runtime': '8352', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '2.111', 'learning_rate': '4.995e-05', 'epoch': '0.2031', 'num_input_tokens_seen': 16513149, 'train_runtime': '8353', 'train_tokens_per_second': '1977'} +{'loss': '0.7316', 'grad_norm': '1.419', 'learning_rate': '4.995e-05', 'epoch': '0.2031', 'num_input_tokens_seen': 16515196, 'train_runtime': '8354', 'train_tokens_per_second': '1977'} +{'loss': '0.3399', 'grad_norm': '0.9108', 'learning_rate': '4.995e-05', 'epoch': '0.2032', 'num_input_tokens_seen': 16517243, 'train_runtime': '8355', 'train_tokens_per_second': '1977'} +{'loss': '0.5813', 'grad_norm': '1.384', 'learning_rate': '4.995e-05', 'epoch': '0.2032', 'num_input_tokens_seen': 16519290, 'train_runtime': '8356', 'train_tokens_per_second': '1977'} +{'loss': '2.067', 'grad_norm': '2.733', 'learning_rate': '4.995e-05', 'epoch': '0.2032', 'num_input_tokens_seen': 16521337, 'train_runtime': '8358', 'train_tokens_per_second': '1977'} +{'loss': '0.3931', 'grad_norm': '1.265', 'learning_rate': '4.995e-05', 'epoch': '0.2032', 'num_input_tokens_seen': 16523384, 'train_runtime': '8359', 'train_tokens_per_second': '1977'} +{'loss': '0.8576', 'grad_norm': '1.393', 'learning_rate': '4.995e-05', 'epoch': '0.2033', 'num_input_tokens_seen': 16525431, 'train_runtime': '8360', 'train_tokens_per_second': '1977'} +{'loss': '0.8491', 'grad_norm': '1.418', 'learning_rate': '4.995e-05', 'epoch': '0.2033', 'num_input_tokens_seen': 16527478, 'train_runtime': '8361', 'train_tokens_per_second': '1977'} +{'loss': '0.5079', 'grad_norm': '1.184', 'learning_rate': '4.995e-05', 'epoch': '0.2033', 'num_input_tokens_seen': 16529525, 'train_runtime': '8362', 'train_tokens_per_second': '1977'} +{'loss': '0.7002', 'grad_norm': '1.46', 'learning_rate': '4.995e-05', 'epoch': '0.2033', 'num_input_tokens_seen': 16531572, 'train_runtime': '8363', 'train_tokens_per_second': '1977'} +{'loss': '0.2856', 'grad_norm': '1.254', 'learning_rate': '4.995e-05', 'epoch': '0.2034', 'num_input_tokens_seen': 16533619, 'train_runtime': '8364', 'train_tokens_per_second': '1977'} +{'loss': '0.7362', 'grad_norm': '1.229', 'learning_rate': '4.995e-05', 'epoch': '0.2034', 'num_input_tokens_seen': 16535666, 'train_runtime': '8365', 'train_tokens_per_second': '1977'} +{'loss': '0.8547', 'grad_norm': '1.699', 'learning_rate': '4.995e-05', 'epoch': '0.2034', 'num_input_tokens_seen': 16537713, 'train_runtime': '8366', 'train_tokens_per_second': '1977'} +{'loss': '0.7575', 'grad_norm': '1.205', 'learning_rate': '4.995e-05', 'epoch': '0.2034', 'num_input_tokens_seen': 16539760, 'train_runtime': '8367', 'train_tokens_per_second': '1977'} +{'loss': '0.9508', 'grad_norm': '1.575', 'learning_rate': '4.995e-05', 'epoch': '0.2035', 'num_input_tokens_seen': 16541807, 'train_runtime': '8368', 'train_tokens_per_second': '1977'} +{'loss': '0.4815', 'grad_norm': '1.193', 'learning_rate': '4.995e-05', 'epoch': '0.2035', 'num_input_tokens_seen': 16543854, 'train_runtime': '8369', 'train_tokens_per_second': '1977'} +{'loss': '0.6441', 'grad_norm': '1.224', 'learning_rate': '4.994e-05', 'epoch': '0.2035', 'num_input_tokens_seen': 16545901, 'train_runtime': '8370', 'train_tokens_per_second': '1977'} +{'loss': '0.358', 'grad_norm': '0.9111', 'learning_rate': '4.994e-05', 'epoch': '0.2035', 'num_input_tokens_seen': 16547948, 'train_runtime': '8371', 'train_tokens_per_second': '1977'} +{'loss': '0.6835', 'grad_norm': '1.486', 'learning_rate': '4.994e-05', 'epoch': '0.2036', 'num_input_tokens_seen': 16549995, 'train_runtime': '8372', 'train_tokens_per_second': '1977'} +{'loss': '0.9287', 'grad_norm': '1.683', 'learning_rate': '4.994e-05', 'epoch': '0.2036', 'num_input_tokens_seen': 16552042, 'train_runtime': '8373', 'train_tokens_per_second': '1977'} +{'loss': '0.3975', 'grad_norm': '0.9061', 'learning_rate': '4.994e-05', 'epoch': '0.2036', 'num_input_tokens_seen': 16554089, 'train_runtime': '8374', 'train_tokens_per_second': '1977'} +{'loss': '1.375', 'grad_norm': '2.173', 'learning_rate': '4.994e-05', 'epoch': '0.2036', 'num_input_tokens_seen': 16556136, 'train_runtime': '8375', 'train_tokens_per_second': '1977'} +{'loss': '0.6044', 'grad_norm': '1.138', 'learning_rate': '4.994e-05', 'epoch': '0.2037', 'num_input_tokens_seen': 16558183, 'train_runtime': '8376', 'train_tokens_per_second': '1977'} +{'loss': '1.236', 'grad_norm': '2.313', 'learning_rate': '4.994e-05', 'epoch': '0.2037', 'num_input_tokens_seen': 16560230, 'train_runtime': '8377', 'train_tokens_per_second': '1977'} +{'loss': '2.084', 'grad_norm': '2.663', 'learning_rate': '4.994e-05', 'epoch': '0.2037', 'num_input_tokens_seen': 16562277, 'train_runtime': '8378', 'train_tokens_per_second': '1977'} +{'loss': '0.7639', 'grad_norm': '1.298', 'learning_rate': '4.994e-05', 'epoch': '0.2037', 'num_input_tokens_seen': 16564324, 'train_runtime': '8379', 'train_tokens_per_second': '1977'} +{'loss': '1.188', 'grad_norm': '1.831', 'learning_rate': '4.994e-05', 'epoch': '0.2038', 'num_input_tokens_seen': 16566371, 'train_runtime': '8380', 'train_tokens_per_second': '1977'} +{'loss': '2.129', 'grad_norm': '2.157', 'learning_rate': '4.994e-05', 'epoch': '0.2038', 'num_input_tokens_seen': 16568418, 'train_runtime': '8381', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.465', 'learning_rate': '4.994e-05', 'epoch': '0.2038', 'num_input_tokens_seen': 16570465, 'train_runtime': '8382', 'train_tokens_per_second': '1977'} +{'loss': '0.6729', 'grad_norm': '1.437', 'learning_rate': '4.994e-05', 'epoch': '0.2038', 'num_input_tokens_seen': 16572512, 'train_runtime': '8383', 'train_tokens_per_second': '1977'} +{'loss': '0.3964', 'grad_norm': '0.8753', 'learning_rate': '4.994e-05', 'epoch': '0.2039', 'num_input_tokens_seen': 16574559, 'train_runtime': '8384', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.878', 'learning_rate': '4.994e-05', 'epoch': '0.2039', 'num_input_tokens_seen': 16576606, 'train_runtime': '8385', 'train_tokens_per_second': '1977'} +{'loss': '0.5164', 'grad_norm': '1.097', 'learning_rate': '4.994e-05', 'epoch': '0.2039', 'num_input_tokens_seen': 16578653, 'train_runtime': '8386', 'train_tokens_per_second': '1977'} +{'loss': '0.369', 'grad_norm': '0.9007', 'learning_rate': '4.994e-05', 'epoch': '0.2039', 'num_input_tokens_seen': 16580700, 'train_runtime': '8387', 'train_tokens_per_second': '1977'} +{'loss': '0.7636', 'grad_norm': '1.373', 'learning_rate': '4.994e-05', 'epoch': '0.204', 'num_input_tokens_seen': 16582747, 'train_runtime': '8389', 'train_tokens_per_second': '1977'} +{'loss': '0.5035', 'grad_norm': '1.356', 'learning_rate': '4.994e-05', 'epoch': '0.204', 'num_input_tokens_seen': 16584794, 'train_runtime': '8390', 'train_tokens_per_second': '1977'} +{'loss': '1.769', 'grad_norm': '2.733', 'learning_rate': '4.994e-05', 'epoch': '0.204', 'num_input_tokens_seen': 16586841, 'train_runtime': '8391', 'train_tokens_per_second': '1977'} +{'loss': '0.7823', 'grad_norm': '1.231', 'learning_rate': '4.994e-05', 'epoch': '0.204', 'num_input_tokens_seen': 16588888, 'train_runtime': '8392', 'train_tokens_per_second': '1977'} +{'loss': '0.8669', 'grad_norm': '1.373', 'learning_rate': '4.994e-05', 'epoch': '0.2041', 'num_input_tokens_seen': 16590935, 'train_runtime': '8393', 'train_tokens_per_second': '1977'} +{'loss': '1.349', 'grad_norm': '2.157', 'learning_rate': '4.994e-05', 'epoch': '0.2041', 'num_input_tokens_seen': 16592982, 'train_runtime': '8394', 'train_tokens_per_second': '1977'} +{'loss': '0.5362', 'grad_norm': '1.368', 'learning_rate': '4.994e-05', 'epoch': '0.2041', 'num_input_tokens_seen': 16595029, 'train_runtime': '8395', 'train_tokens_per_second': '1977'} +{'loss': '0.9295', 'grad_norm': '1.531', 'learning_rate': '4.994e-05', 'epoch': '0.2041', 'num_input_tokens_seen': 16597076, 'train_runtime': '8396', 'train_tokens_per_second': '1977'} +{'loss': '1.211', 'grad_norm': '2.256', 'learning_rate': '4.994e-05', 'epoch': '0.2042', 'num_input_tokens_seen': 16599123, 'train_runtime': '8397', 'train_tokens_per_second': '1977'} +{'loss': '0.6793', 'grad_norm': '1.238', 'learning_rate': '4.994e-05', 'epoch': '0.2042', 'num_input_tokens_seen': 16601170, 'train_runtime': '8398', 'train_tokens_per_second': '1977'} +{'loss': '0.9635', 'grad_norm': '1.852', 'learning_rate': '4.994e-05', 'epoch': '0.2042', 'num_input_tokens_seen': 16603217, 'train_runtime': '8399', 'train_tokens_per_second': '1977'} +{'loss': '0.678', 'grad_norm': '1.401', 'learning_rate': '4.994e-05', 'epoch': '0.2042', 'num_input_tokens_seen': 16605264, 'train_runtime': '8400', 'train_tokens_per_second': '1977'} +{'loss': '0.5899', 'grad_norm': '1.156', 'learning_rate': '4.994e-05', 'epoch': '0.2043', 'num_input_tokens_seen': 16607311, 'train_runtime': '8401', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '1.45', 'learning_rate': '4.994e-05', 'epoch': '0.2043', 'num_input_tokens_seen': 16609358, 'train_runtime': '8402', 'train_tokens_per_second': '1977'} +{'loss': '0.5054', 'grad_norm': '1.139', 'learning_rate': '4.994e-05', 'epoch': '0.2043', 'num_input_tokens_seen': 16611405, 'train_runtime': '8403', 'train_tokens_per_second': '1977'} +{'loss': '1.094', 'grad_norm': '1.481', 'learning_rate': '4.994e-05', 'epoch': '0.2043', 'num_input_tokens_seen': 16613452, 'train_runtime': '8404', 'train_tokens_per_second': '1977'} +{'loss': '0.9912', 'grad_norm': '1.538', 'learning_rate': '4.994e-05', 'epoch': '0.2044', 'num_input_tokens_seen': 16615499, 'train_runtime': '8405', 'train_tokens_per_second': '1977'} +{'loss': '0.3485', 'grad_norm': '1.073', 'learning_rate': '4.994e-05', 'epoch': '0.2044', 'num_input_tokens_seen': 16617546, 'train_runtime': '8406', 'train_tokens_per_second': '1977'} +{'loss': '0.5486', 'grad_norm': '1.257', 'learning_rate': '4.994e-05', 'epoch': '0.2044', 'num_input_tokens_seen': 16619593, 'train_runtime': '8407', 'train_tokens_per_second': '1977'} +{'loss': '0.4424', 'grad_norm': '0.8972', 'learning_rate': '4.994e-05', 'epoch': '0.2044', 'num_input_tokens_seen': 16621640, 'train_runtime': '8408', 'train_tokens_per_second': '1977'} +{'loss': '0.6931', 'grad_norm': '1.282', 'learning_rate': '4.994e-05', 'epoch': '0.2045', 'num_input_tokens_seen': 16623687, 'train_runtime': '8409', 'train_tokens_per_second': '1977'} +{'loss': '1.351', 'grad_norm': '2.266', 'learning_rate': '4.994e-05', 'epoch': '0.2045', 'num_input_tokens_seen': 16625734, 'train_runtime': '8410', 'train_tokens_per_second': '1977'} +{'loss': '0.9368', 'grad_norm': '1.386', 'learning_rate': '4.994e-05', 'epoch': '0.2045', 'num_input_tokens_seen': 16627781, 'train_runtime': '8411', 'train_tokens_per_second': '1977'} +{'loss': '0.7714', 'grad_norm': '1.183', 'learning_rate': '4.994e-05', 'epoch': '0.2045', 'num_input_tokens_seen': 16629828, 'train_runtime': '8412', 'train_tokens_per_second': '1977'} +{'loss': '0.5139', 'grad_norm': '1.223', 'learning_rate': '4.994e-05', 'epoch': '0.2046', 'num_input_tokens_seen': 16631875, 'train_runtime': '8413', 'train_tokens_per_second': '1977'} +{'loss': '0.5754', 'grad_norm': '1.375', 'learning_rate': '4.994e-05', 'epoch': '0.2046', 'num_input_tokens_seen': 16633922, 'train_runtime': '8414', 'train_tokens_per_second': '1977'} +{'loss': '0.6629', 'grad_norm': '0.9487', 'learning_rate': '4.994e-05', 'epoch': '0.2046', 'num_input_tokens_seen': 16635969, 'train_runtime': '8415', 'train_tokens_per_second': '1977'} +{'loss': '0.9059', 'grad_norm': '1.3', 'learning_rate': '4.994e-05', 'epoch': '0.2046', 'num_input_tokens_seen': 16638016, 'train_runtime': '8416', 'train_tokens_per_second': '1977'} +{'loss': '1.336', 'grad_norm': '2.011', 'learning_rate': '4.994e-05', 'epoch': '0.2047', 'num_input_tokens_seen': 16640063, 'train_runtime': '8418', 'train_tokens_per_second': '1977'} +{'loss': '0.1678', 'grad_norm': '0.6956', 'learning_rate': '4.994e-05', 'epoch': '0.2047', 'num_input_tokens_seen': 16642110, 'train_runtime': '8419', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.604', 'learning_rate': '4.994e-05', 'epoch': '0.2047', 'num_input_tokens_seen': 16644157, 'train_runtime': '8420', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '1.465', 'learning_rate': '4.994e-05', 'epoch': '0.2047', 'num_input_tokens_seen': 16646204, 'train_runtime': '8421', 'train_tokens_per_second': '1977'} +{'loss': '0.3753', 'grad_norm': '0.8972', 'learning_rate': '4.994e-05', 'epoch': '0.2048', 'num_input_tokens_seen': 16648251, 'train_runtime': '8422', 'train_tokens_per_second': '1977'} +{'loss': '0.2948', 'grad_norm': '0.9018', 'learning_rate': '4.994e-05', 'epoch': '0.2048', 'num_input_tokens_seen': 16650298, 'train_runtime': '8423', 'train_tokens_per_second': '1977'} +{'loss': '0.6698', 'grad_norm': '1.441', 'learning_rate': '4.994e-05', 'epoch': '0.2048', 'num_input_tokens_seen': 16652345, 'train_runtime': '8424', 'train_tokens_per_second': '1977'} +{'loss': '1.286', 'grad_norm': '2.011', 'learning_rate': '4.994e-05', 'epoch': '0.2048', 'num_input_tokens_seen': 16654392, 'train_runtime': '8425', 'train_tokens_per_second': '1977'} +{'loss': '0.3995', 'grad_norm': '0.9352', 'learning_rate': '4.994e-05', 'epoch': '0.2049', 'num_input_tokens_seen': 16656439, 'train_runtime': '8426', 'train_tokens_per_second': '1977'} +{'loss': '0.9883', 'grad_norm': '1.514', 'learning_rate': '4.994e-05', 'epoch': '0.2049', 'num_input_tokens_seen': 16658486, 'train_runtime': '8427', 'train_tokens_per_second': '1977'} +{'loss': '1.745', 'grad_norm': '2.882', 'learning_rate': '4.994e-05', 'epoch': '0.2049', 'num_input_tokens_seen': 16660533, 'train_runtime': '8428', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '2.23', 'learning_rate': '4.994e-05', 'epoch': '0.205', 'num_input_tokens_seen': 16662580, 'train_runtime': '8429', 'train_tokens_per_second': '1977'} +{'loss': '1.206', 'grad_norm': '1.338', 'learning_rate': '4.994e-05', 'epoch': '0.205', 'num_input_tokens_seen': 16664627, 'train_runtime': '8430', 'train_tokens_per_second': '1977'} +{'loss': '0.415', 'grad_norm': '1.083', 'learning_rate': '4.994e-05', 'epoch': '0.205', 'num_input_tokens_seen': 16666674, 'train_runtime': '8431', 'train_tokens_per_second': '1977'} +{'loss': '0.9463', 'grad_norm': '2.174', 'learning_rate': '4.994e-05', 'epoch': '0.205', 'num_input_tokens_seen': 16668721, 'train_runtime': '8432', 'train_tokens_per_second': '1977'} +{'loss': '1.045', 'grad_norm': '1.524', 'learning_rate': '4.994e-05', 'epoch': '0.2051', 'num_input_tokens_seen': 16670768, 'train_runtime': '8433', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.54', 'learning_rate': '4.994e-05', 'epoch': '0.2051', 'num_input_tokens_seen': 16672815, 'train_runtime': '8434', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '2.172', 'learning_rate': '4.994e-05', 'epoch': '0.2051', 'num_input_tokens_seen': 16674862, 'train_runtime': '8435', 'train_tokens_per_second': '1977'} +{'loss': '1.854', 'grad_norm': '2.409', 'learning_rate': '4.994e-05', 'epoch': '0.2051', 'num_input_tokens_seen': 16676909, 'train_runtime': '8436', 'train_tokens_per_second': '1977'} +{'loss': '1.365', 'grad_norm': '2.12', 'learning_rate': '4.994e-05', 'epoch': '0.2052', 'num_input_tokens_seen': 16678956, 'train_runtime': '8437', 'train_tokens_per_second': '1977'} +{'loss': '0.3058', 'grad_norm': '0.9996', 'learning_rate': '4.994e-05', 'epoch': '0.2052', 'num_input_tokens_seen': 16681003, 'train_runtime': '8438', 'train_tokens_per_second': '1977'} +{'loss': '1.043', 'grad_norm': '1.6', 'learning_rate': '4.994e-05', 'epoch': '0.2052', 'num_input_tokens_seen': 16683050, 'train_runtime': '8439', 'train_tokens_per_second': '1977'} +{'loss': '0.7416', 'grad_norm': '1.518', 'learning_rate': '4.994e-05', 'epoch': '0.2052', 'num_input_tokens_seen': 16685097, 'train_runtime': '8440', 'train_tokens_per_second': '1977'} +{'loss': '0.5581', 'grad_norm': '1.097', 'learning_rate': '4.994e-05', 'epoch': '0.2053', 'num_input_tokens_seen': 16687144, 'train_runtime': '8441', 'train_tokens_per_second': '1977'} +{'loss': '0.8488', 'grad_norm': '1.496', 'learning_rate': '4.994e-05', 'epoch': '0.2053', 'num_input_tokens_seen': 16689191, 'train_runtime': '8442', 'train_tokens_per_second': '1977'} +{'loss': '0.9801', 'grad_norm': '1.43', 'learning_rate': '4.994e-05', 'epoch': '0.2053', 'num_input_tokens_seen': 16691238, 'train_runtime': '8443', 'train_tokens_per_second': '1977'} +{'loss': '0.6006', 'grad_norm': '1.356', 'learning_rate': '4.994e-05', 'epoch': '0.2053', 'num_input_tokens_seen': 16693285, 'train_runtime': '8444', 'train_tokens_per_second': '1977'} +{'loss': '1.006', 'grad_norm': '1.571', 'learning_rate': '4.994e-05', 'epoch': '0.2054', 'num_input_tokens_seen': 16695332, 'train_runtime': '8445', 'train_tokens_per_second': '1977'} +{'loss': '1.397', 'grad_norm': '2.134', 'learning_rate': '4.994e-05', 'epoch': '0.2054', 'num_input_tokens_seen': 16697379, 'train_runtime': '8446', 'train_tokens_per_second': '1977'} +{'loss': '0.4716', 'grad_norm': '1.023', 'learning_rate': '4.994e-05', 'epoch': '0.2054', 'num_input_tokens_seen': 16699426, 'train_runtime': '8448', 'train_tokens_per_second': '1977'} +{'loss': '0.8619', 'grad_norm': '1.458', 'learning_rate': '4.994e-05', 'epoch': '0.2054', 'num_input_tokens_seen': 16701473, 'train_runtime': '8449', 'train_tokens_per_second': '1977'} +{'loss': '0.7228', 'grad_norm': '1.566', 'learning_rate': '4.994e-05', 'epoch': '0.2055', 'num_input_tokens_seen': 16703520, 'train_runtime': '8450', 'train_tokens_per_second': '1977'} +{'loss': '1.328', 'grad_norm': '2.193', 'learning_rate': '4.994e-05', 'epoch': '0.2055', 'num_input_tokens_seen': 16705567, 'train_runtime': '8451', 'train_tokens_per_second': '1977'} +{'loss': '0.2114', 'grad_norm': '0.854', 'learning_rate': '4.994e-05', 'epoch': '0.2055', 'num_input_tokens_seen': 16707614, 'train_runtime': '8452', 'train_tokens_per_second': '1977'} +{'loss': '0.9658', 'grad_norm': '1.362', 'learning_rate': '4.994e-05', 'epoch': '0.2055', 'num_input_tokens_seen': 16709661, 'train_runtime': '8453', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '1.942', 'learning_rate': '4.994e-05', 'epoch': '0.2056', 'num_input_tokens_seen': 16711708, 'train_runtime': '8454', 'train_tokens_per_second': '1977'} +{'loss': '0.6568', 'grad_norm': '0.9023', 'learning_rate': '4.994e-05', 'epoch': '0.2056', 'num_input_tokens_seen': 16713755, 'train_runtime': '8455', 'train_tokens_per_second': '1977'} +{'loss': '0.916', 'grad_norm': '1.678', 'learning_rate': '4.994e-05', 'epoch': '0.2056', 'num_input_tokens_seen': 16715802, 'train_runtime': '8456', 'train_tokens_per_second': '1977'} +{'loss': '0.4358', 'grad_norm': '1.246', 'learning_rate': '4.994e-05', 'epoch': '0.2056', 'num_input_tokens_seen': 16717849, 'train_runtime': '8457', 'train_tokens_per_second': '1977'} +{'loss': '0.5081', 'grad_norm': '1.04', 'learning_rate': '4.994e-05', 'epoch': '0.2057', 'num_input_tokens_seen': 16719896, 'train_runtime': '8458', 'train_tokens_per_second': '1977'} +{'loss': '1.569', 'grad_norm': '2.408', 'learning_rate': '4.994e-05', 'epoch': '0.2057', 'num_input_tokens_seen': 16721943, 'train_runtime': '8459', 'train_tokens_per_second': '1977'} +{'loss': '0.8364', 'grad_norm': '1.563', 'learning_rate': '4.994e-05', 'epoch': '0.2057', 'num_input_tokens_seen': 16723990, 'train_runtime': '8460', 'train_tokens_per_second': '1977'} +{'loss': '2.006', 'grad_norm': '2.385', 'learning_rate': '4.994e-05', 'epoch': '0.2057', 'num_input_tokens_seen': 16726037, 'train_runtime': '8461', 'train_tokens_per_second': '1977'} +{'loss': '0.6998', 'grad_norm': '1.209', 'learning_rate': '4.994e-05', 'epoch': '0.2058', 'num_input_tokens_seen': 16728084, 'train_runtime': '8462', 'train_tokens_per_second': '1977'} +{'loss': '0.6116', 'grad_norm': '1.433', 'learning_rate': '4.994e-05', 'epoch': '0.2058', 'num_input_tokens_seen': 16730131, 'train_runtime': '8463', 'train_tokens_per_second': '1977'} +{'loss': '1.504', 'grad_norm': '2.005', 'learning_rate': '4.994e-05', 'epoch': '0.2058', 'num_input_tokens_seen': 16732178, 'train_runtime': '8464', 'train_tokens_per_second': '1977'} +{'loss': '0.874', 'grad_norm': '1.801', 'learning_rate': '4.994e-05', 'epoch': '0.2058', 'num_input_tokens_seen': 16734225, 'train_runtime': '8465', 'train_tokens_per_second': '1977'} +{'loss': '0.5226', 'grad_norm': '1.39', 'learning_rate': '4.994e-05', 'epoch': '0.2059', 'num_input_tokens_seen': 16736272, 'train_runtime': '8466', 'train_tokens_per_second': '1977'} +{'loss': '0.7866', 'grad_norm': '1.498', 'learning_rate': '4.994e-05', 'epoch': '0.2059', 'num_input_tokens_seen': 16738319, 'train_runtime': '8467', 'train_tokens_per_second': '1977'} +{'loss': '1.627', 'grad_norm': '2.416', 'learning_rate': '4.994e-05', 'epoch': '0.2059', 'num_input_tokens_seen': 16740366, 'train_runtime': '8468', 'train_tokens_per_second': '1977'} +{'loss': '0.7043', 'grad_norm': '1.074', 'learning_rate': '4.994e-05', 'epoch': '0.2059', 'num_input_tokens_seen': 16742413, 'train_runtime': '8469', 'train_tokens_per_second': '1977'} +{'loss': '0.4072', 'grad_norm': '1.107', 'learning_rate': '4.994e-05', 'epoch': '0.206', 'num_input_tokens_seen': 16744460, 'train_runtime': '8470', 'train_tokens_per_second': '1977'} +{'loss': '0.8405', 'grad_norm': '1.501', 'learning_rate': '4.994e-05', 'epoch': '0.206', 'num_input_tokens_seen': 16746507, 'train_runtime': '8471', 'train_tokens_per_second': '1977'} +{'loss': '0.5308', 'grad_norm': '1.184', 'learning_rate': '4.994e-05', 'epoch': '0.206', 'num_input_tokens_seen': 16748554, 'train_runtime': '8472', 'train_tokens_per_second': '1977'} +{'loss': '0.7909', 'grad_norm': '1.499', 'learning_rate': '4.994e-05', 'epoch': '0.206', 'num_input_tokens_seen': 16750601, 'train_runtime': '8473', 'train_tokens_per_second': '1977'} +{'loss': '1.32', 'grad_norm': '2.175', 'learning_rate': '4.994e-05', 'epoch': '0.2061', 'num_input_tokens_seen': 16752648, 'train_runtime': '8474', 'train_tokens_per_second': '1977'} +{'loss': '0.4013', 'grad_norm': '1.168', 'learning_rate': '4.994e-05', 'epoch': '0.2061', 'num_input_tokens_seen': 16754695, 'train_runtime': '8475', 'train_tokens_per_second': '1977'} +{'loss': '0.5262', 'grad_norm': '1.101', 'learning_rate': '4.994e-05', 'epoch': '0.2061', 'num_input_tokens_seen': 16756742, 'train_runtime': '8477', 'train_tokens_per_second': '1977'} +{'loss': '0.3822', 'grad_norm': '0.9625', 'learning_rate': '4.994e-05', 'epoch': '0.2061', 'num_input_tokens_seen': 16758789, 'train_runtime': '8478', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.695', 'learning_rate': '4.994e-05', 'epoch': '0.2062', 'num_input_tokens_seen': 16760836, 'train_runtime': '8479', 'train_tokens_per_second': '1977'} +{'loss': '0.428', 'grad_norm': '1.045', 'learning_rate': '4.994e-05', 'epoch': '0.2062', 'num_input_tokens_seen': 16762883, 'train_runtime': '8480', 'train_tokens_per_second': '1977'} +{'loss': '0.2695', 'grad_norm': '0.9533', 'learning_rate': '4.994e-05', 'epoch': '0.2062', 'num_input_tokens_seen': 16764930, 'train_runtime': '8481', 'train_tokens_per_second': '1977'} +{'loss': '1.509', 'grad_norm': '2.136', 'learning_rate': '4.994e-05', 'epoch': '0.2062', 'num_input_tokens_seen': 16766977, 'train_runtime': '8482', 'train_tokens_per_second': '1977'} +{'loss': '0.8111', 'grad_norm': '1.527', 'learning_rate': '4.994e-05', 'epoch': '0.2063', 'num_input_tokens_seen': 16769024, 'train_runtime': '8483', 'train_tokens_per_second': '1977'} +{'loss': '0.5854', 'grad_norm': '1.271', 'learning_rate': '4.994e-05', 'epoch': '0.2063', 'num_input_tokens_seen': 16771071, 'train_runtime': '8484', 'train_tokens_per_second': '1977'} +{'loss': '2.25', 'grad_norm': '2.14', 'learning_rate': '4.994e-05', 'epoch': '0.2063', 'num_input_tokens_seen': 16773118, 'train_runtime': '8485', 'train_tokens_per_second': '1977'} +{'loss': '2.472', 'grad_norm': '2.572', 'learning_rate': '4.994e-05', 'epoch': '0.2063', 'num_input_tokens_seen': 16775165, 'train_runtime': '8486', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '2.015', 'learning_rate': '4.994e-05', 'epoch': '0.2064', 'num_input_tokens_seen': 16777212, 'train_runtime': '8487', 'train_tokens_per_second': '1977'} +{'loss': '0.4615', 'grad_norm': '1.158', 'learning_rate': '4.994e-05', 'epoch': '0.2064', 'num_input_tokens_seen': 16779259, 'train_runtime': '8488', 'train_tokens_per_second': '1977'} +{'loss': '1.756', 'grad_norm': '2.481', 'learning_rate': '4.994e-05', 'epoch': '0.2064', 'num_input_tokens_seen': 16781306, 'train_runtime': '8489', 'train_tokens_per_second': '1977'} +{'loss': '0.2978', 'grad_norm': '1.046', 'learning_rate': '4.994e-05', 'epoch': '0.2064', 'num_input_tokens_seen': 16783353, 'train_runtime': '8490', 'train_tokens_per_second': '1977'} +{'loss': '0.3115', 'grad_norm': '1.012', 'learning_rate': '4.994e-05', 'epoch': '0.2065', 'num_input_tokens_seen': 16785400, 'train_runtime': '8491', 'train_tokens_per_second': '1977'} +{'loss': '0.7651', 'grad_norm': '1.309', 'learning_rate': '4.994e-05', 'epoch': '0.2065', 'num_input_tokens_seen': 16787447, 'train_runtime': '8492', 'train_tokens_per_second': '1977'} +{'loss': '0.3003', 'grad_norm': '0.9813', 'learning_rate': '4.994e-05', 'epoch': '0.2065', 'num_input_tokens_seen': 16789494, 'train_runtime': '8493', 'train_tokens_per_second': '1977'} +{'loss': '0.8662', 'grad_norm': '1.408', 'learning_rate': '4.994e-05', 'epoch': '0.2065', 'num_input_tokens_seen': 16791541, 'train_runtime': '8494', 'train_tokens_per_second': '1977'} +{'loss': '0.8057', 'grad_norm': '1.48', 'learning_rate': '4.994e-05', 'epoch': '0.2066', 'num_input_tokens_seen': 16793588, 'train_runtime': '8495', 'train_tokens_per_second': '1977'} +{'loss': '0.798', 'grad_norm': '1.355', 'learning_rate': '4.994e-05', 'epoch': '0.2066', 'num_input_tokens_seen': 16795635, 'train_runtime': '8496', 'train_tokens_per_second': '1977'} +{'loss': '0.9821', 'grad_norm': '1.535', 'learning_rate': '4.994e-05', 'epoch': '0.2066', 'num_input_tokens_seen': 16797682, 'train_runtime': '8497', 'train_tokens_per_second': '1977'} +{'loss': '0.6158', 'grad_norm': '0.9464', 'learning_rate': '4.994e-05', 'epoch': '0.2066', 'num_input_tokens_seen': 16799729, 'train_runtime': '8498', 'train_tokens_per_second': '1977'} +{'loss': '2.653', 'grad_norm': '2.554', 'learning_rate': '4.994e-05', 'epoch': '0.2067', 'num_input_tokens_seen': 16801776, 'train_runtime': '8499', 'train_tokens_per_second': '1977'} +{'loss': '0.3652', 'grad_norm': '0.9533', 'learning_rate': '4.994e-05', 'epoch': '0.2067', 'num_input_tokens_seen': 16803823, 'train_runtime': '8500', 'train_tokens_per_second': '1977'} +{'loss': '1.022', 'grad_norm': '1.627', 'learning_rate': '4.994e-05', 'epoch': '0.2067', 'num_input_tokens_seen': 16805870, 'train_runtime': '8501', 'train_tokens_per_second': '1977'} +{'loss': '1.233', 'grad_norm': '1.826', 'learning_rate': '4.994e-05', 'epoch': '0.2067', 'num_input_tokens_seen': 16807917, 'train_runtime': '8502', 'train_tokens_per_second': '1977'} +{'loss': '0.6496', 'grad_norm': '1.176', 'learning_rate': '4.994e-05', 'epoch': '0.2068', 'num_input_tokens_seen': 16809964, 'train_runtime': '8503', 'train_tokens_per_second': '1977'} +{'loss': '0.9222', 'grad_norm': '1.38', 'learning_rate': '4.994e-05', 'epoch': '0.2068', 'num_input_tokens_seen': 16812011, 'train_runtime': '8504', 'train_tokens_per_second': '1977'} +{'loss': '0.203', 'grad_norm': '0.8113', 'learning_rate': '4.994e-05', 'epoch': '0.2068', 'num_input_tokens_seen': 16814058, 'train_runtime': '8506', 'train_tokens_per_second': '1977'} +{'loss': '0.3814', 'grad_norm': '0.9868', 'learning_rate': '4.994e-05', 'epoch': '0.2068', 'num_input_tokens_seen': 16816105, 'train_runtime': '8507', 'train_tokens_per_second': '1977'} +{'loss': '0.3463', 'grad_norm': '0.9284', 'learning_rate': '4.994e-05', 'epoch': '0.2069', 'num_input_tokens_seen': 16818152, 'train_runtime': '8508', 'train_tokens_per_second': '1977'} +{'loss': '0.7045', 'grad_norm': '1.566', 'learning_rate': '4.994e-05', 'epoch': '0.2069', 'num_input_tokens_seen': 16820199, 'train_runtime': '8509', 'train_tokens_per_second': '1977'} +{'loss': '0.5141', 'grad_norm': '1.573', 'learning_rate': '4.994e-05', 'epoch': '0.2069', 'num_input_tokens_seen': 16822246, 'train_runtime': '8510', 'train_tokens_per_second': '1977'} +{'loss': '0.4416', 'grad_norm': '1.058', 'learning_rate': '4.994e-05', 'epoch': '0.2069', 'num_input_tokens_seen': 16824293, 'train_runtime': '8511', 'train_tokens_per_second': '1977'} +{'loss': '0.2721', 'grad_norm': '0.9414', 'learning_rate': '4.994e-05', 'epoch': '0.207', 'num_input_tokens_seen': 16826340, 'train_runtime': '8512', 'train_tokens_per_second': '1977'} +{'loss': '1.024', 'grad_norm': '1.898', 'learning_rate': '4.994e-05', 'epoch': '0.207', 'num_input_tokens_seen': 16828387, 'train_runtime': '8513', 'train_tokens_per_second': '1977'} +{'loss': '0.3609', 'grad_norm': '0.9707', 'learning_rate': '4.994e-05', 'epoch': '0.207', 'num_input_tokens_seen': 16830434, 'train_runtime': '8514', 'train_tokens_per_second': '1977'} +{'loss': '3.106', 'grad_norm': '2.551', 'learning_rate': '4.994e-05', 'epoch': '0.207', 'num_input_tokens_seen': 16832481, 'train_runtime': '8515', 'train_tokens_per_second': '1977'} +{'loss': '1.169', 'grad_norm': '1.718', 'learning_rate': '4.994e-05', 'epoch': '0.2071', 'num_input_tokens_seen': 16834528, 'train_runtime': '8516', 'train_tokens_per_second': '1977'} +{'loss': '0.6107', 'grad_norm': '0.9703', 'learning_rate': '4.994e-05', 'epoch': '0.2071', 'num_input_tokens_seen': 16836575, 'train_runtime': '8517', 'train_tokens_per_second': '1977'} +{'loss': '2.421', 'grad_norm': '2.479', 'learning_rate': '4.994e-05', 'epoch': '0.2071', 'num_input_tokens_seen': 16838622, 'train_runtime': '8518', 'train_tokens_per_second': '1977'} +{'loss': '0.7007', 'grad_norm': '0.9204', 'learning_rate': '4.994e-05', 'epoch': '0.2071', 'num_input_tokens_seen': 16840669, 'train_runtime': '8519', 'train_tokens_per_second': '1977'} +{'loss': '0.3389', 'grad_norm': '1.062', 'learning_rate': '4.994e-05', 'epoch': '0.2072', 'num_input_tokens_seen': 16842716, 'train_runtime': '8520', 'train_tokens_per_second': '1977'} +{'loss': '0.3009', 'grad_norm': '0.8122', 'learning_rate': '4.994e-05', 'epoch': '0.2072', 'num_input_tokens_seen': 16844763, 'train_runtime': '8521', 'train_tokens_per_second': '1977'} +{'loss': '0.8127', 'grad_norm': '1.361', 'learning_rate': '4.994e-05', 'epoch': '0.2072', 'num_input_tokens_seen': 16846810, 'train_runtime': '8522', 'train_tokens_per_second': '1977'} +{'loss': '0.3834', 'grad_norm': '0.8411', 'learning_rate': '4.994e-05', 'epoch': '0.2072', 'num_input_tokens_seen': 16848857, 'train_runtime': '8523', 'train_tokens_per_second': '1977'} +{'loss': '0.6377', 'grad_norm': '1.088', 'learning_rate': '4.994e-05', 'epoch': '0.2073', 'num_input_tokens_seen': 16850904, 'train_runtime': '8524', 'train_tokens_per_second': '1977'} +{'loss': '0.5461', 'grad_norm': '1.058', 'learning_rate': '4.994e-05', 'epoch': '0.2073', 'num_input_tokens_seen': 16852951, 'train_runtime': '8525', 'train_tokens_per_second': '1977'} +{'loss': '0.3541', 'grad_norm': '0.9836', 'learning_rate': '4.994e-05', 'epoch': '0.2073', 'num_input_tokens_seen': 16854998, 'train_runtime': '8526', 'train_tokens_per_second': '1977'} +{'loss': '0.7353', 'grad_norm': '1.098', 'learning_rate': '4.994e-05', 'epoch': '0.2073', 'num_input_tokens_seen': 16857045, 'train_runtime': '8527', 'train_tokens_per_second': '1977'} +{'loss': '0.7944', 'grad_norm': '1.518', 'learning_rate': '4.994e-05', 'epoch': '0.2074', 'num_input_tokens_seen': 16859092, 'train_runtime': '8528', 'train_tokens_per_second': '1977'} +{'loss': '0.4012', 'grad_norm': '0.9639', 'learning_rate': '4.994e-05', 'epoch': '0.2074', 'num_input_tokens_seen': 16861139, 'train_runtime': '8529', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.946', 'learning_rate': '4.994e-05', 'epoch': '0.2074', 'num_input_tokens_seen': 16863186, 'train_runtime': '8530', 'train_tokens_per_second': '1977'} +{'loss': '0.6295', 'grad_norm': '1.187', 'learning_rate': '4.994e-05', 'epoch': '0.2074', 'num_input_tokens_seen': 16865233, 'train_runtime': '8531', 'train_tokens_per_second': '1977'} +{'loss': '1.252', 'grad_norm': '2.166', 'learning_rate': '4.994e-05', 'epoch': '0.2075', 'num_input_tokens_seen': 16867280, 'train_runtime': '8532', 'train_tokens_per_second': '1977'} +{'loss': '0.4314', 'grad_norm': '1.006', 'learning_rate': '4.994e-05', 'epoch': '0.2075', 'num_input_tokens_seen': 16869327, 'train_runtime': '8533', 'train_tokens_per_second': '1977'} +{'loss': '0.5695', 'grad_norm': '1.438', 'learning_rate': '4.994e-05', 'epoch': '0.2075', 'num_input_tokens_seen': 16871374, 'train_runtime': '8535', 'train_tokens_per_second': '1977'} +{'loss': '0.5786', 'grad_norm': '2.59', 'learning_rate': '4.994e-05', 'epoch': '0.2075', 'num_input_tokens_seen': 16873421, 'train_runtime': '8536', 'train_tokens_per_second': '1977'} +{'loss': '0.515', 'grad_norm': '1.261', 'learning_rate': '4.994e-05', 'epoch': '0.2076', 'num_input_tokens_seen': 16875468, 'train_runtime': '8537', 'train_tokens_per_second': '1977'} +{'loss': '0.4292', 'grad_norm': '0.9894', 'learning_rate': '4.994e-05', 'epoch': '0.2076', 'num_input_tokens_seen': 16877515, 'train_runtime': '8538', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.556', 'learning_rate': '4.994e-05', 'epoch': '0.2076', 'num_input_tokens_seen': 16879562, 'train_runtime': '8539', 'train_tokens_per_second': '1977'} +{'loss': '1.059', 'grad_norm': '1.875', 'learning_rate': '4.994e-05', 'epoch': '0.2076', 'num_input_tokens_seen': 16881609, 'train_runtime': '8540', 'train_tokens_per_second': '1977'} +{'loss': '0.3541', 'grad_norm': '0.7679', 'learning_rate': '4.994e-05', 'epoch': '0.2077', 'num_input_tokens_seen': 16883656, 'train_runtime': '8541', 'train_tokens_per_second': '1977'} +{'loss': '2.068', 'grad_norm': '5.992', 'learning_rate': '4.994e-05', 'epoch': '0.2077', 'num_input_tokens_seen': 16885703, 'train_runtime': '8542', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '2.027', 'learning_rate': '4.994e-05', 'epoch': '0.2077', 'num_input_tokens_seen': 16887750, 'train_runtime': '8543', 'train_tokens_per_second': '1977'} +{'loss': '0.3029', 'grad_norm': '0.9541', 'learning_rate': '4.994e-05', 'epoch': '0.2077', 'num_input_tokens_seen': 16889797, 'train_runtime': '8544', 'train_tokens_per_second': '1977'} +{'loss': '1.861', 'grad_norm': '2.716', 'learning_rate': '4.994e-05', 'epoch': '0.2078', 'num_input_tokens_seen': 16891844, 'train_runtime': '8545', 'train_tokens_per_second': '1977'} +{'loss': '0.4123', 'grad_norm': '1.004', 'learning_rate': '4.994e-05', 'epoch': '0.2078', 'num_input_tokens_seen': 16893891, 'train_runtime': '8546', 'train_tokens_per_second': '1977'} +{'loss': '1.1', 'grad_norm': '1.664', 'learning_rate': '4.994e-05', 'epoch': '0.2078', 'num_input_tokens_seen': 16895938, 'train_runtime': '8547', 'train_tokens_per_second': '1977'} +{'loss': '1.479', 'grad_norm': '2.249', 'learning_rate': '4.994e-05', 'epoch': '0.2078', 'num_input_tokens_seen': 16897985, 'train_runtime': '8548', 'train_tokens_per_second': '1977'} +{'loss': '1.854', 'grad_norm': '2.61', 'learning_rate': '4.994e-05', 'epoch': '0.2079', 'num_input_tokens_seen': 16900032, 'train_runtime': '8549', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.926', 'learning_rate': '4.994e-05', 'epoch': '0.2079', 'num_input_tokens_seen': 16902079, 'train_runtime': '8550', 'train_tokens_per_second': '1977'} +{'loss': '0.2238', 'grad_norm': '0.8559', 'learning_rate': '4.994e-05', 'epoch': '0.2079', 'num_input_tokens_seen': 16904126, 'train_runtime': '8551', 'train_tokens_per_second': '1977'} +{'loss': '0.4059', 'grad_norm': '0.9366', 'learning_rate': '4.994e-05', 'epoch': '0.2079', 'num_input_tokens_seen': 16906173, 'train_runtime': '8552', 'train_tokens_per_second': '1977'} +{'loss': '2.086', 'grad_norm': '2.813', 'learning_rate': '4.994e-05', 'epoch': '0.208', 'num_input_tokens_seen': 16908220, 'train_runtime': '8553', 'train_tokens_per_second': '1977'} +{'loss': '0.4734', 'grad_norm': '1.048', 'learning_rate': '4.994e-05', 'epoch': '0.208', 'num_input_tokens_seen': 16910267, 'train_runtime': '8554', 'train_tokens_per_second': '1977'} +{'loss': '0.8956', 'grad_norm': '1.549', 'learning_rate': '4.994e-05', 'epoch': '0.208', 'num_input_tokens_seen': 16912314, 'train_runtime': '8555', 'train_tokens_per_second': '1977'} +{'loss': '0.5896', 'grad_norm': '1.359', 'learning_rate': '4.994e-05', 'epoch': '0.208', 'num_input_tokens_seen': 16914361, 'train_runtime': '8556', 'train_tokens_per_second': '1977'} +{'loss': '0.6947', 'grad_norm': '1.099', 'learning_rate': '4.994e-05', 'epoch': '0.2081', 'num_input_tokens_seen': 16916408, 'train_runtime': '8557', 'train_tokens_per_second': '1977'} +{'loss': '0.2968', 'grad_norm': '0.9112', 'learning_rate': '4.994e-05', 'epoch': '0.2081', 'num_input_tokens_seen': 16918455, 'train_runtime': '8558', 'train_tokens_per_second': '1977'} +{'loss': '0.6614', 'grad_norm': '0.9331', 'learning_rate': '4.994e-05', 'epoch': '0.2081', 'num_input_tokens_seen': 16920502, 'train_runtime': '8560', 'train_tokens_per_second': '1977'} +{'loss': '0.7092', 'grad_norm': '1.438', 'learning_rate': '4.994e-05', 'epoch': '0.2081', 'num_input_tokens_seen': 16922549, 'train_runtime': '8561', 'train_tokens_per_second': '1977'} +{'loss': '0.5056', 'grad_norm': '1.053', 'learning_rate': '4.994e-05', 'epoch': '0.2082', 'num_input_tokens_seen': 16924596, 'train_runtime': '8562', 'train_tokens_per_second': '1977'} +{'loss': '1.219', 'grad_norm': '1.773', 'learning_rate': '4.994e-05', 'epoch': '0.2082', 'num_input_tokens_seen': 16926643, 'train_runtime': '8563', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '1.666', 'learning_rate': '4.994e-05', 'epoch': '0.2082', 'num_input_tokens_seen': 16928690, 'train_runtime': '8564', 'train_tokens_per_second': '1977'} +{'loss': '0.4455', 'grad_norm': '1.104', 'learning_rate': '4.994e-05', 'epoch': '0.2082', 'num_input_tokens_seen': 16930737, 'train_runtime': '8565', 'train_tokens_per_second': '1977'} +{'loss': '0.8877', 'grad_norm': '1.454', 'learning_rate': '4.994e-05', 'epoch': '0.2083', 'num_input_tokens_seen': 16932784, 'train_runtime': '8566', 'train_tokens_per_second': '1977'} +{'loss': '0.8558', 'grad_norm': '1.235', 'learning_rate': '4.994e-05', 'epoch': '0.2083', 'num_input_tokens_seen': 16934831, 'train_runtime': '8567', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '2.17', 'learning_rate': '4.994e-05', 'epoch': '0.2083', 'num_input_tokens_seen': 16936878, 'train_runtime': '8568', 'train_tokens_per_second': '1977'} +{'loss': '0.3239', 'grad_norm': '0.9176', 'learning_rate': '4.994e-05', 'epoch': '0.2083', 'num_input_tokens_seen': 16938925, 'train_runtime': '8569', 'train_tokens_per_second': '1977'} +{'loss': '0.5243', 'grad_norm': '1.168', 'learning_rate': '4.994e-05', 'epoch': '0.2084', 'num_input_tokens_seen': 16940972, 'train_runtime': '8570', 'train_tokens_per_second': '1977'} +{'loss': '0.8363', 'grad_norm': '1.309', 'learning_rate': '4.994e-05', 'epoch': '0.2084', 'num_input_tokens_seen': 16943019, 'train_runtime': '8571', 'train_tokens_per_second': '1977'} +{'loss': '0.9257', 'grad_norm': '1.714', 'learning_rate': '4.994e-05', 'epoch': '0.2084', 'num_input_tokens_seen': 16945066, 'train_runtime': '8572', 'train_tokens_per_second': '1977'} +{'loss': '0.5019', 'grad_norm': '1.169', 'learning_rate': '4.994e-05', 'epoch': '0.2084', 'num_input_tokens_seen': 16947113, 'train_runtime': '8573', 'train_tokens_per_second': '1977'} +{'loss': '0.3435', 'grad_norm': '0.848', 'learning_rate': '4.994e-05', 'epoch': '0.2085', 'num_input_tokens_seen': 16949160, 'train_runtime': '8574', 'train_tokens_per_second': '1977'} +{'loss': '0.7047', 'grad_norm': '1.598', 'learning_rate': '4.994e-05', 'epoch': '0.2085', 'num_input_tokens_seen': 16951207, 'train_runtime': '8575', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '1.208', 'learning_rate': '4.994e-05', 'epoch': '0.2085', 'num_input_tokens_seen': 16953254, 'train_runtime': '8576', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '2.162', 'learning_rate': '4.994e-05', 'epoch': '0.2086', 'num_input_tokens_seen': 16955301, 'train_runtime': '8577', 'train_tokens_per_second': '1977'} +{'loss': '0.4628', 'grad_norm': '1.059', 'learning_rate': '4.994e-05', 'epoch': '0.2086', 'num_input_tokens_seen': 16957348, 'train_runtime': '8578', 'train_tokens_per_second': '1977'} +{'loss': '0.5134', 'grad_norm': '1.19', 'learning_rate': '4.994e-05', 'epoch': '0.2086', 'num_input_tokens_seen': 16959395, 'train_runtime': '8579', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.699', 'learning_rate': '4.994e-05', 'epoch': '0.2086', 'num_input_tokens_seen': 16961442, 'train_runtime': '8580', 'train_tokens_per_second': '1977'} +{'loss': '0.582', 'grad_norm': '1.143', 'learning_rate': '4.994e-05', 'epoch': '0.2087', 'num_input_tokens_seen': 16963489, 'train_runtime': '8581', 'train_tokens_per_second': '1977'} +{'loss': '0.5391', 'grad_norm': '1.287', 'learning_rate': '4.994e-05', 'epoch': '0.2087', 'num_input_tokens_seen': 16965536, 'train_runtime': '8582', 'train_tokens_per_second': '1977'} +{'loss': '1.315', 'grad_norm': '1.923', 'learning_rate': '4.994e-05', 'epoch': '0.2087', 'num_input_tokens_seen': 16967583, 'train_runtime': '8583', 'train_tokens_per_second': '1977'} +{'loss': '0.6325', 'grad_norm': '1.278', 'learning_rate': '4.994e-05', 'epoch': '0.2087', 'num_input_tokens_seen': 16969630, 'train_runtime': '8584', 'train_tokens_per_second': '1977'} +{'loss': '1.159', 'grad_norm': '2.268', 'learning_rate': '4.994e-05', 'epoch': '0.2088', 'num_input_tokens_seen': 16971677, 'train_runtime': '8585', 'train_tokens_per_second': '1977'} +{'loss': '0.9874', 'grad_norm': '1.801', 'learning_rate': '4.994e-05', 'epoch': '0.2088', 'num_input_tokens_seen': 16973724, 'train_runtime': '8586', 'train_tokens_per_second': '1977'} +{'loss': '2.717', 'grad_norm': '2.674', 'learning_rate': '4.994e-05', 'epoch': '0.2088', 'num_input_tokens_seen': 16975771, 'train_runtime': '8587', 'train_tokens_per_second': '1977'} +{'loss': '0.4028', 'grad_norm': '0.9141', 'learning_rate': '4.994e-05', 'epoch': '0.2088', 'num_input_tokens_seen': 16977818, 'train_runtime': '8589', 'train_tokens_per_second': '1977'} +{'loss': '0.331', 'grad_norm': '0.7829', 'learning_rate': '4.994e-05', 'epoch': '0.2089', 'num_input_tokens_seen': 16979865, 'train_runtime': '8590', 'train_tokens_per_second': '1977'} +{'loss': '1.222', 'grad_norm': '1.951', 'learning_rate': '4.994e-05', 'epoch': '0.2089', 'num_input_tokens_seen': 16981912, 'train_runtime': '8591', 'train_tokens_per_second': '1977'} +{'loss': '1.516', 'grad_norm': '2.383', 'learning_rate': '4.994e-05', 'epoch': '0.2089', 'num_input_tokens_seen': 16983959, 'train_runtime': '8592', 'train_tokens_per_second': '1977'} +{'loss': '0.4075', 'grad_norm': '1.302', 'learning_rate': '4.994e-05', 'epoch': '0.2089', 'num_input_tokens_seen': 16986006, 'train_runtime': '8593', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '2.129', 'learning_rate': '4.994e-05', 'epoch': '0.209', 'num_input_tokens_seen': 16988053, 'train_runtime': '8594', 'train_tokens_per_second': '1977'} +{'loss': '1.916', 'grad_norm': '2.36', 'learning_rate': '4.994e-05', 'epoch': '0.209', 'num_input_tokens_seen': 16990100, 'train_runtime': '8595', 'train_tokens_per_second': '1977'} +{'loss': '0.9489', 'grad_norm': '1.853', 'learning_rate': '4.994e-05', 'epoch': '0.209', 'num_input_tokens_seen': 16992147, 'train_runtime': '8596', 'train_tokens_per_second': '1977'} +{'loss': '2.306', 'grad_norm': '2.651', 'learning_rate': '4.994e-05', 'epoch': '0.209', 'num_input_tokens_seen': 16994194, 'train_runtime': '8597', 'train_tokens_per_second': '1977'} +{'loss': '0.3669', 'grad_norm': '1.089', 'learning_rate': '4.994e-05', 'epoch': '0.2091', 'num_input_tokens_seen': 16996241, 'train_runtime': '8598', 'train_tokens_per_second': '1977'} +{'loss': '1.055', 'grad_norm': '1.5', 'learning_rate': '4.994e-05', 'epoch': '0.2091', 'num_input_tokens_seen': 16998288, 'train_runtime': '8599', 'train_tokens_per_second': '1977'} +{'loss': '1.779', 'grad_norm': '2.541', 'learning_rate': '4.994e-05', 'epoch': '0.2091', 'num_input_tokens_seen': 17000335, 'train_runtime': '8600', 'train_tokens_per_second': '1977'} +{'loss': '0.3783', 'grad_norm': '0.8633', 'learning_rate': '4.994e-05', 'epoch': '0.2091', 'num_input_tokens_seen': 17002382, 'train_runtime': '8601', 'train_tokens_per_second': '1977'} +{'loss': '0.3098', 'grad_norm': '0.8464', 'learning_rate': '4.994e-05', 'epoch': '0.2092', 'num_input_tokens_seen': 17004429, 'train_runtime': '8602', 'train_tokens_per_second': '1977'} +{'loss': '0.3631', 'grad_norm': '0.9119', 'learning_rate': '4.994e-05', 'epoch': '0.2092', 'num_input_tokens_seen': 17006476, 'train_runtime': '8603', 'train_tokens_per_second': '1977'} +{'loss': '0.5677', 'grad_norm': '1.116', 'learning_rate': '4.994e-05', 'epoch': '0.2092', 'num_input_tokens_seen': 17008523, 'train_runtime': '8604', 'train_tokens_per_second': '1977'} +{'loss': '0.7149', 'grad_norm': '1.223', 'learning_rate': '4.994e-05', 'epoch': '0.2092', 'num_input_tokens_seen': 17010570, 'train_runtime': '8605', 'train_tokens_per_second': '1977'} +{'loss': '0.9377', 'grad_norm': '1.99', 'learning_rate': '4.994e-05', 'epoch': '0.2093', 'num_input_tokens_seen': 17012617, 'train_runtime': '8606', 'train_tokens_per_second': '1977'} +{'loss': '0.3921', 'grad_norm': '1.279', 'learning_rate': '4.994e-05', 'epoch': '0.2093', 'num_input_tokens_seen': 17014664, 'train_runtime': '8607', 'train_tokens_per_second': '1977'} +{'loss': '0.7023', 'grad_norm': '1.387', 'learning_rate': '4.994e-05', 'epoch': '0.2093', 'num_input_tokens_seen': 17016711, 'train_runtime': '8608', 'train_tokens_per_second': '1977'} +{'loss': '1.023', 'grad_norm': '1.47', 'learning_rate': '4.994e-05', 'epoch': '0.2093', 'num_input_tokens_seen': 17018758, 'train_runtime': '8609', 'train_tokens_per_second': '1977'} +{'loss': '0.5601', 'grad_norm': '1.034', 'learning_rate': '4.994e-05', 'epoch': '0.2094', 'num_input_tokens_seen': 17020805, 'train_runtime': '8610', 'train_tokens_per_second': '1977'} +{'loss': '0.9265', 'grad_norm': '1.595', 'learning_rate': '4.994e-05', 'epoch': '0.2094', 'num_input_tokens_seen': 17022852, 'train_runtime': '8611', 'train_tokens_per_second': '1977'} +{'loss': '1.754', 'grad_norm': '2.427', 'learning_rate': '4.994e-05', 'epoch': '0.2094', 'num_input_tokens_seen': 17024899, 'train_runtime': '8612', 'train_tokens_per_second': '1977'} +{'loss': '1.016', 'grad_norm': '1.761', 'learning_rate': '4.994e-05', 'epoch': '0.2094', 'num_input_tokens_seen': 17026946, 'train_runtime': '8613', 'train_tokens_per_second': '1977'} +{'loss': '0.5975', 'grad_norm': '1.216', 'learning_rate': '4.994e-05', 'epoch': '0.2095', 'num_input_tokens_seen': 17028993, 'train_runtime': '8614', 'train_tokens_per_second': '1977'} +{'loss': '1.311', 'grad_norm': '1.84', 'learning_rate': '4.994e-05', 'epoch': '0.2095', 'num_input_tokens_seen': 17031040, 'train_runtime': '8615', 'train_tokens_per_second': '1977'} +{'loss': '0.2133', 'grad_norm': '0.8444', 'learning_rate': '4.994e-05', 'epoch': '0.2095', 'num_input_tokens_seen': 17033087, 'train_runtime': '8616', 'train_tokens_per_second': '1977'} +{'loss': '1.25', 'grad_norm': '1.218', 'learning_rate': '4.994e-05', 'epoch': '0.2095', 'num_input_tokens_seen': 17035134, 'train_runtime': '8617', 'train_tokens_per_second': '1977'} +{'loss': '0.7908', 'grad_norm': '1.092', 'learning_rate': '4.994e-05', 'epoch': '0.2096', 'num_input_tokens_seen': 17037181, 'train_runtime': '8618', 'train_tokens_per_second': '1977'} +{'loss': '0.9524', 'grad_norm': '1.287', 'learning_rate': '4.994e-05', 'epoch': '0.2096', 'num_input_tokens_seen': 17039228, 'train_runtime': '8620', 'train_tokens_per_second': '1977'} +{'loss': '0.7927', 'grad_norm': '0.9822', 'learning_rate': '4.994e-05', 'epoch': '0.2096', 'num_input_tokens_seen': 17041275, 'train_runtime': '8621', 'train_tokens_per_second': '1977'} +{'loss': '0.2366', 'grad_norm': '1.026', 'learning_rate': '4.994e-05', 'epoch': '0.2096', 'num_input_tokens_seen': 17043322, 'train_runtime': '8622', 'train_tokens_per_second': '1977'} +{'loss': '0.5857', 'grad_norm': '0.9439', 'learning_rate': '4.994e-05', 'epoch': '0.2097', 'num_input_tokens_seen': 17045369, 'train_runtime': '8623', 'train_tokens_per_second': '1977'} +{'loss': '0.3394', 'grad_norm': '1.082', 'learning_rate': '4.994e-05', 'epoch': '0.2097', 'num_input_tokens_seen': 17047416, 'train_runtime': '8624', 'train_tokens_per_second': '1977'} +{'loss': '0.8739', 'grad_norm': '1.387', 'learning_rate': '4.994e-05', 'epoch': '0.2097', 'num_input_tokens_seen': 17049463, 'train_runtime': '8625', 'train_tokens_per_second': '1977'} +{'loss': '0.9531', 'grad_norm': '1.456', 'learning_rate': '4.994e-05', 'epoch': '0.2097', 'num_input_tokens_seen': 17051510, 'train_runtime': '8626', 'train_tokens_per_second': '1977'} +{'loss': '1.035', 'grad_norm': '1.384', 'learning_rate': '4.994e-05', 'epoch': '0.2098', 'num_input_tokens_seen': 17053557, 'train_runtime': '8627', 'train_tokens_per_second': '1977'} +{'loss': '1.098', 'grad_norm': '1.666', 'learning_rate': '4.994e-05', 'epoch': '0.2098', 'num_input_tokens_seen': 17055604, 'train_runtime': '8628', 'train_tokens_per_second': '1977'} +{'loss': '1.238', 'grad_norm': '1.92', 'learning_rate': '4.994e-05', 'epoch': '0.2098', 'num_input_tokens_seen': 17057651, 'train_runtime': '8629', 'train_tokens_per_second': '1977'} +{'loss': '0.6544', 'grad_norm': '1.444', 'learning_rate': '4.994e-05', 'epoch': '0.2098', 'num_input_tokens_seen': 17059698, 'train_runtime': '8630', 'train_tokens_per_second': '1977'} +{'loss': '0.611', 'grad_norm': '1.342', 'learning_rate': '4.994e-05', 'epoch': '0.2099', 'num_input_tokens_seen': 17061745, 'train_runtime': '8631', 'train_tokens_per_second': '1977'} +{'loss': '0.9543', 'grad_norm': '1.463', 'learning_rate': '4.994e-05', 'epoch': '0.2099', 'num_input_tokens_seen': 17063792, 'train_runtime': '8632', 'train_tokens_per_second': '1977'} +{'loss': '0.6226', 'grad_norm': '1.202', 'learning_rate': '4.994e-05', 'epoch': '0.2099', 'num_input_tokens_seen': 17065839, 'train_runtime': '8633', 'train_tokens_per_second': '1977'} +{'loss': '0.5568', 'grad_norm': '1.282', 'learning_rate': '4.994e-05', 'epoch': '0.2099', 'num_input_tokens_seen': 17067886, 'train_runtime': '8634', 'train_tokens_per_second': '1977'} +{'loss': '0.7427', 'grad_norm': '1.437', 'learning_rate': '4.994e-05', 'epoch': '0.21', 'num_input_tokens_seen': 17069933, 'train_runtime': '8635', 'train_tokens_per_second': '1977'} +{'loss': '0.958', 'grad_norm': '1.745', 'learning_rate': '4.994e-05', 'epoch': '0.21', 'num_input_tokens_seen': 17071980, 'train_runtime': '8636', 'train_tokens_per_second': '1977'} +{'loss': '1.875', 'grad_norm': '2.98', 'learning_rate': '4.994e-05', 'epoch': '0.21', 'num_input_tokens_seen': 17074027, 'train_runtime': '8637', 'train_tokens_per_second': '1977'} +{'loss': '0.6395', 'grad_norm': '1.147', 'learning_rate': '4.994e-05', 'epoch': '0.21', 'num_input_tokens_seen': 17076074, 'train_runtime': '8638', 'train_tokens_per_second': '1977'} +{'loss': '1.252', 'grad_norm': '1.569', 'learning_rate': '4.994e-05', 'epoch': '0.2101', 'num_input_tokens_seen': 17078121, 'train_runtime': '8639', 'train_tokens_per_second': '1977'} +{'loss': '0.6417', 'grad_norm': '1.169', 'learning_rate': '4.994e-05', 'epoch': '0.2101', 'num_input_tokens_seen': 17080168, 'train_runtime': '8640', 'train_tokens_per_second': '1977'} +{'loss': '0.423', 'grad_norm': '0.8535', 'learning_rate': '4.994e-05', 'epoch': '0.2101', 'num_input_tokens_seen': 17082215, 'train_runtime': '8641', 'train_tokens_per_second': '1977'} +{'loss': '0.2929', 'grad_norm': '1.004', 'learning_rate': '4.994e-05', 'epoch': '0.2101', 'num_input_tokens_seen': 17084262, 'train_runtime': '8642', 'train_tokens_per_second': '1977'} +{'loss': '0.301', 'grad_norm': '0.8437', 'learning_rate': '4.994e-05', 'epoch': '0.2102', 'num_input_tokens_seen': 17086309, 'train_runtime': '8643', 'train_tokens_per_second': '1977'} +{'loss': '0.6891', 'grad_norm': '1.114', 'learning_rate': '4.994e-05', 'epoch': '0.2102', 'num_input_tokens_seen': 17088356, 'train_runtime': '8644', 'train_tokens_per_second': '1977'} +{'loss': '1.519', 'grad_norm': '2.299', 'learning_rate': '4.994e-05', 'epoch': '0.2102', 'num_input_tokens_seen': 17090403, 'train_runtime': '8645', 'train_tokens_per_second': '1977'} +{'loss': '0.4698', 'grad_norm': '1.155', 'learning_rate': '4.994e-05', 'epoch': '0.2102', 'num_input_tokens_seen': 17092450, 'train_runtime': '8646', 'train_tokens_per_second': '1977'} +{'loss': '0.3824', 'grad_norm': '0.9633', 'learning_rate': '4.994e-05', 'epoch': '0.2103', 'num_input_tokens_seen': 17094497, 'train_runtime': '8647', 'train_tokens_per_second': '1977'} +{'loss': '0.3619', 'grad_norm': '0.917', 'learning_rate': '4.994e-05', 'epoch': '0.2103', 'num_input_tokens_seen': 17096544, 'train_runtime': '8648', 'train_tokens_per_second': '1977'} +{'loss': '0.921', 'grad_norm': '1.947', 'learning_rate': '4.994e-05', 'epoch': '0.2103', 'num_input_tokens_seen': 17098591, 'train_runtime': '8649', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '2.431', 'learning_rate': '4.994e-05', 'epoch': '0.2103', 'num_input_tokens_seen': 17100638, 'train_runtime': '8651', 'train_tokens_per_second': '1977'} +{'loss': '0.3589', 'grad_norm': '1.051', 'learning_rate': '4.994e-05', 'epoch': '0.2104', 'num_input_tokens_seen': 17102685, 'train_runtime': '8652', 'train_tokens_per_second': '1977'} +{'loss': '0.7776', 'grad_norm': '1.515', 'learning_rate': '4.994e-05', 'epoch': '0.2104', 'num_input_tokens_seen': 17104732, 'train_runtime': '8653', 'train_tokens_per_second': '1977'} +{'loss': '0.8199', 'grad_norm': '1.408', 'learning_rate': '4.994e-05', 'epoch': '0.2104', 'num_input_tokens_seen': 17106779, 'train_runtime': '8654', 'train_tokens_per_second': '1977'} +{'loss': '0.4047', 'grad_norm': '1.026', 'learning_rate': '4.994e-05', 'epoch': '0.2104', 'num_input_tokens_seen': 17108826, 'train_runtime': '8655', 'train_tokens_per_second': '1977'} +{'loss': '0.7838', 'grad_norm': '1.201', 'learning_rate': '4.994e-05', 'epoch': '0.2105', 'num_input_tokens_seen': 17110873, 'train_runtime': '8656', 'train_tokens_per_second': '1977'} +{'loss': '0.3482', 'grad_norm': '1.222', 'learning_rate': '4.994e-05', 'epoch': '0.2105', 'num_input_tokens_seen': 17112920, 'train_runtime': '8657', 'train_tokens_per_second': '1977'} +{'loss': '0.2835', 'grad_norm': '1.225', 'learning_rate': '4.994e-05', 'epoch': '0.2105', 'num_input_tokens_seen': 17114967, 'train_runtime': '8658', 'train_tokens_per_second': '1977'} +{'loss': '0.8641', 'grad_norm': '1.361', 'learning_rate': '4.994e-05', 'epoch': '0.2105', 'num_input_tokens_seen': 17117014, 'train_runtime': '8659', 'train_tokens_per_second': '1977'} +{'loss': '0.4622', 'grad_norm': '1.077', 'learning_rate': '4.994e-05', 'epoch': '0.2106', 'num_input_tokens_seen': 17119061, 'train_runtime': '8660', 'train_tokens_per_second': '1977'} +{'loss': '1.483', 'grad_norm': '2.196', 'learning_rate': '4.994e-05', 'epoch': '0.2106', 'num_input_tokens_seen': 17121108, 'train_runtime': '8661', 'train_tokens_per_second': '1977'} +{'loss': '0.4152', 'grad_norm': '0.8614', 'learning_rate': '4.994e-05', 'epoch': '0.2106', 'num_input_tokens_seen': 17123155, 'train_runtime': '8662', 'train_tokens_per_second': '1977'} +{'loss': '0.3392', 'grad_norm': '1.05', 'learning_rate': '4.994e-05', 'epoch': '0.2106', 'num_input_tokens_seen': 17125202, 'train_runtime': '8663', 'train_tokens_per_second': '1977'} +{'loss': '2.134', 'grad_norm': '2.438', 'learning_rate': '4.994e-05', 'epoch': '0.2107', 'num_input_tokens_seen': 17127249, 'train_runtime': '8664', 'train_tokens_per_second': '1977'} +{'loss': '1.181', 'grad_norm': '1.854', 'learning_rate': '4.994e-05', 'epoch': '0.2107', 'num_input_tokens_seen': 17129296, 'train_runtime': '8665', 'train_tokens_per_second': '1977'} +{'loss': '0.3779', 'grad_norm': '1.134', 'learning_rate': '4.994e-05', 'epoch': '0.2107', 'num_input_tokens_seen': 17131343, 'train_runtime': '8666', 'train_tokens_per_second': '1977'} +{'loss': '0.8799', 'grad_norm': '1.547', 'learning_rate': '4.994e-05', 'epoch': '0.2107', 'num_input_tokens_seen': 17133390, 'train_runtime': '8667', 'train_tokens_per_second': '1977'} +{'loss': '0.849', 'grad_norm': '1.49', 'learning_rate': '4.994e-05', 'epoch': '0.2108', 'num_input_tokens_seen': 17135437, 'train_runtime': '8668', 'train_tokens_per_second': '1977'} +{'loss': '0.1837', 'grad_norm': '0.9623', 'learning_rate': '4.994e-05', 'epoch': '0.2108', 'num_input_tokens_seen': 17137484, 'train_runtime': '8669', 'train_tokens_per_second': '1977'} +{'loss': '1.866', 'grad_norm': '2.688', 'learning_rate': '4.994e-05', 'epoch': '0.2108', 'num_input_tokens_seen': 17139531, 'train_runtime': '8670', 'train_tokens_per_second': '1977'} +{'loss': '0.5481', 'grad_norm': '1.243', 'learning_rate': '4.994e-05', 'epoch': '0.2108', 'num_input_tokens_seen': 17141578, 'train_runtime': '8671', 'train_tokens_per_second': '1977'} +{'loss': '1.045', 'grad_norm': '1.469', 'learning_rate': '4.994e-05', 'epoch': '0.2109', 'num_input_tokens_seen': 17143625, 'train_runtime': '8672', 'train_tokens_per_second': '1977'} +{'loss': '0.5774', 'grad_norm': '1.297', 'learning_rate': '4.994e-05', 'epoch': '0.2109', 'num_input_tokens_seen': 17145672, 'train_runtime': '8673', 'train_tokens_per_second': '1977'} +{'loss': '0.4787', 'grad_norm': '1.15', 'learning_rate': '4.994e-05', 'epoch': '0.2109', 'num_input_tokens_seen': 17147719, 'train_runtime': '8674', 'train_tokens_per_second': '1977'} +{'loss': '0.3166', 'grad_norm': '0.795', 'learning_rate': '4.994e-05', 'epoch': '0.2109', 'num_input_tokens_seen': 17149766, 'train_runtime': '8675', 'train_tokens_per_second': '1977'} +{'loss': '0.3974', 'grad_norm': '0.833', 'learning_rate': '4.994e-05', 'epoch': '0.211', 'num_input_tokens_seen': 17151813, 'train_runtime': '8676', 'train_tokens_per_second': '1977'} +{'loss': '0.7157', 'grad_norm': '1.366', 'learning_rate': '4.994e-05', 'epoch': '0.211', 'num_input_tokens_seen': 17153860, 'train_runtime': '8677', 'train_tokens_per_second': '1977'} +{'loss': '1.063', 'grad_norm': '1.611', 'learning_rate': '4.994e-05', 'epoch': '0.211', 'num_input_tokens_seen': 17155907, 'train_runtime': '8678', 'train_tokens_per_second': '1977'} +{'loss': '1.536', 'grad_norm': '2.415', 'learning_rate': '4.994e-05', 'epoch': '0.211', 'num_input_tokens_seen': 17157954, 'train_runtime': '8680', 'train_tokens_per_second': '1977'} +{'loss': '0.6868', 'grad_norm': '1.143', 'learning_rate': '4.994e-05', 'epoch': '0.2111', 'num_input_tokens_seen': 17160001, 'train_runtime': '8681', 'train_tokens_per_second': '1977'} +{'loss': '0.6566', 'grad_norm': '1.445', 'learning_rate': '4.994e-05', 'epoch': '0.2111', 'num_input_tokens_seen': 17162048, 'train_runtime': '8682', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '1.898', 'learning_rate': '4.994e-05', 'epoch': '0.2111', 'num_input_tokens_seen': 17164095, 'train_runtime': '8683', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '1.524', 'learning_rate': '4.994e-05', 'epoch': '0.2111', 'num_input_tokens_seen': 17166142, 'train_runtime': '8684', 'train_tokens_per_second': '1977'} +{'loss': '0.3246', 'grad_norm': '0.8605', 'learning_rate': '4.994e-05', 'epoch': '0.2112', 'num_input_tokens_seen': 17168189, 'train_runtime': '8685', 'train_tokens_per_second': '1977'} +{'loss': '0.5624', 'grad_norm': '1.098', 'learning_rate': '4.994e-05', 'epoch': '0.2112', 'num_input_tokens_seen': 17170236, 'train_runtime': '8686', 'train_tokens_per_second': '1977'} +{'loss': '0.8585', 'grad_norm': '1.224', 'learning_rate': '4.994e-05', 'epoch': '0.2112', 'num_input_tokens_seen': 17172283, 'train_runtime': '8687', 'train_tokens_per_second': '1977'} +{'loss': '1.324', 'grad_norm': '2.256', 'learning_rate': '4.994e-05', 'epoch': '0.2112', 'num_input_tokens_seen': 17174330, 'train_runtime': '8688', 'train_tokens_per_second': '1977'} +{'loss': '0.7701', 'grad_norm': '1.146', 'learning_rate': '4.994e-05', 'epoch': '0.2113', 'num_input_tokens_seen': 17176377, 'train_runtime': '8689', 'train_tokens_per_second': '1977'} +{'loss': '0.4828', 'grad_norm': '1.121', 'learning_rate': '4.994e-05', 'epoch': '0.2113', 'num_input_tokens_seen': 17178424, 'train_runtime': '8690', 'train_tokens_per_second': '1977'} +{'loss': '0.7889', 'grad_norm': '1.144', 'learning_rate': '4.994e-05', 'epoch': '0.2113', 'num_input_tokens_seen': 17180471, 'train_runtime': '8691', 'train_tokens_per_second': '1977'} +{'loss': '0.29', 'grad_norm': '1.092', 'learning_rate': '4.994e-05', 'epoch': '0.2113', 'num_input_tokens_seen': 17182518, 'train_runtime': '8692', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.368', 'learning_rate': '4.994e-05', 'epoch': '0.2114', 'num_input_tokens_seen': 17184565, 'train_runtime': '8693', 'train_tokens_per_second': '1977'} +{'loss': '1.998', 'grad_norm': '2.656', 'learning_rate': '4.994e-05', 'epoch': '0.2114', 'num_input_tokens_seen': 17186612, 'train_runtime': '8694', 'train_tokens_per_second': '1977'} +{'loss': '0.9099', 'grad_norm': '1.334', 'learning_rate': '4.994e-05', 'epoch': '0.2114', 'num_input_tokens_seen': 17188659, 'train_runtime': '8695', 'train_tokens_per_second': '1977'} +{'loss': '0.6978', 'grad_norm': '1.269', 'learning_rate': '4.994e-05', 'epoch': '0.2114', 'num_input_tokens_seen': 17190706, 'train_runtime': '8696', 'train_tokens_per_second': '1977'} +{'loss': '0.342', 'grad_norm': '0.9476', 'learning_rate': '4.994e-05', 'epoch': '0.2115', 'num_input_tokens_seen': 17192753, 'train_runtime': '8697', 'train_tokens_per_second': '1977'} +{'loss': '0.7411', 'grad_norm': '1.633', 'learning_rate': '4.994e-05', 'epoch': '0.2115', 'num_input_tokens_seen': 17194800, 'train_runtime': '8698', 'train_tokens_per_second': '1977'} +{'loss': '0.7098', 'grad_norm': '1.651', 'learning_rate': '4.994e-05', 'epoch': '0.2115', 'num_input_tokens_seen': 17196847, 'train_runtime': '8699', 'train_tokens_per_second': '1977'} +{'loss': '0.6726', 'grad_norm': '1.583', 'learning_rate': '4.994e-05', 'epoch': '0.2115', 'num_input_tokens_seen': 17198894, 'train_runtime': '8700', 'train_tokens_per_second': '1977'} +{'loss': '0.8353', 'grad_norm': '1.174', 'learning_rate': '4.994e-05', 'epoch': '0.2116', 'num_input_tokens_seen': 17200941, 'train_runtime': '8701', 'train_tokens_per_second': '1977'} +{'loss': '0.7762', 'grad_norm': '1.427', 'learning_rate': '4.994e-05', 'epoch': '0.2116', 'num_input_tokens_seen': 17202988, 'train_runtime': '8702', 'train_tokens_per_second': '1977'} +{'loss': '0.9228', 'grad_norm': '1.385', 'learning_rate': '4.994e-05', 'epoch': '0.2116', 'num_input_tokens_seen': 17205035, 'train_runtime': '8703', 'train_tokens_per_second': '1977'} +{'loss': '0.3481', 'grad_norm': '1.014', 'learning_rate': '4.994e-05', 'epoch': '0.2116', 'num_input_tokens_seen': 17207082, 'train_runtime': '8704', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.621', 'learning_rate': '4.994e-05', 'epoch': '0.2117', 'num_input_tokens_seen': 17209129, 'train_runtime': '8705', 'train_tokens_per_second': '1977'} +{'loss': '0.5285', 'grad_norm': '1.027', 'learning_rate': '4.994e-05', 'epoch': '0.2117', 'num_input_tokens_seen': 17211176, 'train_runtime': '8706', 'train_tokens_per_second': '1977'} +{'loss': '0.3442', 'grad_norm': '1.169', 'learning_rate': '4.994e-05', 'epoch': '0.2117', 'num_input_tokens_seen': 17213223, 'train_runtime': '8707', 'train_tokens_per_second': '1977'} +{'loss': '0.9949', 'grad_norm': '1.811', 'learning_rate': '4.994e-05', 'epoch': '0.2117', 'num_input_tokens_seen': 17215270, 'train_runtime': '8709', 'train_tokens_per_second': '1977'} +{'loss': '1.143', 'grad_norm': '1.277', 'learning_rate': '4.994e-05', 'epoch': '0.2118', 'num_input_tokens_seen': 17217317, 'train_runtime': '8710', 'train_tokens_per_second': '1977'} +{'loss': '0.2528', 'grad_norm': '0.8107', 'learning_rate': '4.994e-05', 'epoch': '0.2118', 'num_input_tokens_seen': 17219364, 'train_runtime': '8711', 'train_tokens_per_second': '1977'} +{'loss': '1.179', 'grad_norm': '2.074', 'learning_rate': '4.994e-05', 'epoch': '0.2118', 'num_input_tokens_seen': 17221411, 'train_runtime': '8712', 'train_tokens_per_second': '1977'} +{'loss': '0.5664', 'grad_norm': '1.496', 'learning_rate': '4.994e-05', 'epoch': '0.2118', 'num_input_tokens_seen': 17223458, 'train_runtime': '8713', 'train_tokens_per_second': '1977'} +{'loss': '0.7979', 'grad_norm': '1.708', 'learning_rate': '4.994e-05', 'epoch': '0.2119', 'num_input_tokens_seen': 17225505, 'train_runtime': '8714', 'train_tokens_per_second': '1977'} +{'loss': '0.9169', 'grad_norm': '1.348', 'learning_rate': '4.994e-05', 'epoch': '0.2119', 'num_input_tokens_seen': 17227552, 'train_runtime': '8715', 'train_tokens_per_second': '1977'} +{'loss': '0.7351', 'grad_norm': '1.208', 'learning_rate': '4.994e-05', 'epoch': '0.2119', 'num_input_tokens_seen': 17229599, 'train_runtime': '8716', 'train_tokens_per_second': '1977'} +{'loss': '0.5153', 'grad_norm': '1.313', 'learning_rate': '4.994e-05', 'epoch': '0.2119', 'num_input_tokens_seen': 17231646, 'train_runtime': '8717', 'train_tokens_per_second': '1977'} +{'loss': '0.2771', 'grad_norm': '1.744', 'learning_rate': '4.994e-05', 'epoch': '0.212', 'num_input_tokens_seen': 17233693, 'train_runtime': '8718', 'train_tokens_per_second': '1977'} +{'loss': '0.3028', 'grad_norm': '0.8224', 'learning_rate': '4.994e-05', 'epoch': '0.212', 'num_input_tokens_seen': 17235740, 'train_runtime': '8719', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.641', 'learning_rate': '4.994e-05', 'epoch': '0.212', 'num_input_tokens_seen': 17237787, 'train_runtime': '8720', 'train_tokens_per_second': '1977'} +{'loss': '0.3317', 'grad_norm': '1.106', 'learning_rate': '4.994e-05', 'epoch': '0.2121', 'num_input_tokens_seen': 17239834, 'train_runtime': '8721', 'train_tokens_per_second': '1977'} +{'loss': '0.5061', 'grad_norm': '1.188', 'learning_rate': '4.994e-05', 'epoch': '0.2121', 'num_input_tokens_seen': 17241881, 'train_runtime': '8722', 'train_tokens_per_second': '1977'} +{'loss': '0.5927', 'grad_norm': '1.097', 'learning_rate': '4.994e-05', 'epoch': '0.2121', 'num_input_tokens_seen': 17243928, 'train_runtime': '8723', 'train_tokens_per_second': '1977'} +{'loss': '0.3613', 'grad_norm': '0.9325', 'learning_rate': '4.994e-05', 'epoch': '0.2121', 'num_input_tokens_seen': 17245975, 'train_runtime': '8724', 'train_tokens_per_second': '1977'} +{'loss': '0.7252', 'grad_norm': '1.457', 'learning_rate': '4.994e-05', 'epoch': '0.2122', 'num_input_tokens_seen': 17248022, 'train_runtime': '8725', 'train_tokens_per_second': '1977'} +{'loss': '0.3469', 'grad_norm': '1.033', 'learning_rate': '4.994e-05', 'epoch': '0.2122', 'num_input_tokens_seen': 17250069, 'train_runtime': '8726', 'train_tokens_per_second': '1977'} +{'loss': '0.7458', 'grad_norm': '1.455', 'learning_rate': '4.994e-05', 'epoch': '0.2122', 'num_input_tokens_seen': 17252116, 'train_runtime': '8727', 'train_tokens_per_second': '1977'} +{'loss': '1.312', 'grad_norm': '2.237', 'learning_rate': '4.994e-05', 'epoch': '0.2122', 'num_input_tokens_seen': 17254163, 'train_runtime': '8728', 'train_tokens_per_second': '1977'} +{'loss': '0.3983', 'grad_norm': '1.056', 'learning_rate': '4.994e-05', 'epoch': '0.2123', 'num_input_tokens_seen': 17256210, 'train_runtime': '8729', 'train_tokens_per_second': '1977'} +{'loss': '0.7766', 'grad_norm': '1.407', 'learning_rate': '4.994e-05', 'epoch': '0.2123', 'num_input_tokens_seen': 17258257, 'train_runtime': '8730', 'train_tokens_per_second': '1977'} +{'loss': '0.746', 'grad_norm': '1.206', 'learning_rate': '4.994e-05', 'epoch': '0.2123', 'num_input_tokens_seen': 17260304, 'train_runtime': '8731', 'train_tokens_per_second': '1977'} +{'loss': '0.6347', 'grad_norm': '1.111', 'learning_rate': '4.994e-05', 'epoch': '0.2123', 'num_input_tokens_seen': 17262351, 'train_runtime': '8732', 'train_tokens_per_second': '1977'} +{'loss': '0.5921', 'grad_norm': '1.436', 'learning_rate': '4.994e-05', 'epoch': '0.2124', 'num_input_tokens_seen': 17264398, 'train_runtime': '8733', 'train_tokens_per_second': '1977'} +{'loss': '0.5706', 'grad_norm': '1.389', 'learning_rate': '4.994e-05', 'epoch': '0.2124', 'num_input_tokens_seen': 17266445, 'train_runtime': '8734', 'train_tokens_per_second': '1977'} +{'loss': '1.039', 'grad_norm': '1.572', 'learning_rate': '4.994e-05', 'epoch': '0.2124', 'num_input_tokens_seen': 17268492, 'train_runtime': '8735', 'train_tokens_per_second': '1977'} +{'loss': '0.7395', 'grad_norm': '1.261', 'learning_rate': '4.994e-05', 'epoch': '0.2124', 'num_input_tokens_seen': 17270539, 'train_runtime': '8737', 'train_tokens_per_second': '1977'} +{'loss': '0.7566', 'grad_norm': '1.447', 'learning_rate': '4.994e-05', 'epoch': '0.2125', 'num_input_tokens_seen': 17272586, 'train_runtime': '8738', 'train_tokens_per_second': '1977'} +{'loss': '0.2738', 'grad_norm': '0.747', 'learning_rate': '4.994e-05', 'epoch': '0.2125', 'num_input_tokens_seen': 17274633, 'train_runtime': '8739', 'train_tokens_per_second': '1977'} +{'loss': '0.8999', 'grad_norm': '1.282', 'learning_rate': '4.994e-05', 'epoch': '0.2125', 'num_input_tokens_seen': 17276680, 'train_runtime': '8740', 'train_tokens_per_second': '1977'} +{'loss': '0.9122', 'grad_norm': '1.334', 'learning_rate': '4.994e-05', 'epoch': '0.2125', 'num_input_tokens_seen': 17278727, 'train_runtime': '8741', 'train_tokens_per_second': '1977'} +{'loss': '0.2179', 'grad_norm': '0.7798', 'learning_rate': '4.993e-05', 'epoch': '0.2126', 'num_input_tokens_seen': 17280774, 'train_runtime': '8742', 'train_tokens_per_second': '1977'} +{'loss': '0.1851', 'grad_norm': '0.8808', 'learning_rate': '4.993e-05', 'epoch': '0.2126', 'num_input_tokens_seen': 17282821, 'train_runtime': '8743', 'train_tokens_per_second': '1977'} +{'loss': '0.2697', 'grad_norm': '0.8015', 'learning_rate': '4.993e-05', 'epoch': '0.2126', 'num_input_tokens_seen': 17284868, 'train_runtime': '8744', 'train_tokens_per_second': '1977'} +{'loss': '0.4624', 'grad_norm': '1.051', 'learning_rate': '4.993e-05', 'epoch': '0.2126', 'num_input_tokens_seen': 17286915, 'train_runtime': '8745', 'train_tokens_per_second': '1977'} +{'loss': '0.8628', 'grad_norm': '1.432', 'learning_rate': '4.993e-05', 'epoch': '0.2127', 'num_input_tokens_seen': 17288962, 'train_runtime': '8746', 'train_tokens_per_second': '1977'} +{'loss': '0.2636', 'grad_norm': '0.9611', 'learning_rate': '4.993e-05', 'epoch': '0.2127', 'num_input_tokens_seen': 17291009, 'train_runtime': '8747', 'train_tokens_per_second': '1977'} +{'loss': '0.3885', 'grad_norm': '0.9635', 'learning_rate': '4.993e-05', 'epoch': '0.2127', 'num_input_tokens_seen': 17293056, 'train_runtime': '8748', 'train_tokens_per_second': '1977'} +{'loss': '0.594', 'grad_norm': '1.173', 'learning_rate': '4.993e-05', 'epoch': '0.2127', 'num_input_tokens_seen': 17295103, 'train_runtime': '8749', 'train_tokens_per_second': '1977'} +{'loss': '2.173', 'grad_norm': '2.514', 'learning_rate': '4.993e-05', 'epoch': '0.2128', 'num_input_tokens_seen': 17297150, 'train_runtime': '8750', 'train_tokens_per_second': '1977'} +{'loss': '0.4644', 'grad_norm': '1.132', 'learning_rate': '4.993e-05', 'epoch': '0.2128', 'num_input_tokens_seen': 17299197, 'train_runtime': '8751', 'train_tokens_per_second': '1977'} +{'loss': '0.6124', 'grad_norm': '0.9098', 'learning_rate': '4.993e-05', 'epoch': '0.2128', 'num_input_tokens_seen': 17301244, 'train_runtime': '8752', 'train_tokens_per_second': '1977'} +{'loss': '0.258', 'grad_norm': '0.9681', 'learning_rate': '4.993e-05', 'epoch': '0.2128', 'num_input_tokens_seen': 17303291, 'train_runtime': '8753', 'train_tokens_per_second': '1977'} +{'loss': '2.833', 'grad_norm': '2.723', 'learning_rate': '4.993e-05', 'epoch': '0.2129', 'num_input_tokens_seen': 17305338, 'train_runtime': '8754', 'train_tokens_per_second': '1977'} +{'loss': '0.8362', 'grad_norm': '1.621', 'learning_rate': '4.993e-05', 'epoch': '0.2129', 'num_input_tokens_seen': 17307385, 'train_runtime': '8755', 'train_tokens_per_second': '1977'} +{'loss': '1.763', 'grad_norm': '2.469', 'learning_rate': '4.993e-05', 'epoch': '0.2129', 'num_input_tokens_seen': 17309432, 'train_runtime': '8756', 'train_tokens_per_second': '1977'} +{'loss': '0.2647', 'grad_norm': '0.8029', 'learning_rate': '4.993e-05', 'epoch': '0.2129', 'num_input_tokens_seen': 17311479, 'train_runtime': '8757', 'train_tokens_per_second': '1977'} +{'loss': '0.5233', 'grad_norm': '1.343', 'learning_rate': '4.993e-05', 'epoch': '0.213', 'num_input_tokens_seen': 17313526, 'train_runtime': '8758', 'train_tokens_per_second': '1977'} +{'loss': '0.8812', 'grad_norm': '1.242', 'learning_rate': '4.993e-05', 'epoch': '0.213', 'num_input_tokens_seen': 17315573, 'train_runtime': '8759', 'train_tokens_per_second': '1977'} +{'loss': '0.4559', 'grad_norm': '1.201', 'learning_rate': '4.993e-05', 'epoch': '0.213', 'num_input_tokens_seen': 17317620, 'train_runtime': '8760', 'train_tokens_per_second': '1977'} +{'loss': '0.3839', 'grad_norm': '1.21', 'learning_rate': '4.993e-05', 'epoch': '0.213', 'num_input_tokens_seen': 17319667, 'train_runtime': '8761', 'train_tokens_per_second': '1977'} +{'loss': '0.5778', 'grad_norm': '1.144', 'learning_rate': '4.993e-05', 'epoch': '0.2131', 'num_input_tokens_seen': 17321714, 'train_runtime': '8762', 'train_tokens_per_second': '1977'} +{'loss': '0.4265', 'grad_norm': '0.9633', 'learning_rate': '4.993e-05', 'epoch': '0.2131', 'num_input_tokens_seen': 17323761, 'train_runtime': '8764', 'train_tokens_per_second': '1977'} +{'loss': '1.347', 'grad_norm': '2.31', 'learning_rate': '4.993e-05', 'epoch': '0.2131', 'num_input_tokens_seen': 17325808, 'train_runtime': '8765', 'train_tokens_per_second': '1977'} +{'loss': '0.3728', 'grad_norm': '0.947', 'learning_rate': '4.993e-05', 'epoch': '0.2131', 'num_input_tokens_seen': 17327855, 'train_runtime': '8766', 'train_tokens_per_second': '1977'} +{'loss': '0.4612', 'grad_norm': '1.137', 'learning_rate': '4.993e-05', 'epoch': '0.2132', 'num_input_tokens_seen': 17329902, 'train_runtime': '8767', 'train_tokens_per_second': '1977'} +{'loss': '0.2937', 'grad_norm': '0.987', 'learning_rate': '4.993e-05', 'epoch': '0.2132', 'num_input_tokens_seen': 17331949, 'train_runtime': '8768', 'train_tokens_per_second': '1977'} +{'loss': '0.6185', 'grad_norm': '1.171', 'learning_rate': '4.993e-05', 'epoch': '0.2132', 'num_input_tokens_seen': 17333996, 'train_runtime': '8769', 'train_tokens_per_second': '1977'} +{'loss': '0.7105', 'grad_norm': '0.9484', 'learning_rate': '4.993e-05', 'epoch': '0.2132', 'num_input_tokens_seen': 17336043, 'train_runtime': '8770', 'train_tokens_per_second': '1977'} +{'loss': '0.489', 'grad_norm': '0.8403', 'learning_rate': '4.993e-05', 'epoch': '0.2133', 'num_input_tokens_seen': 17338090, 'train_runtime': '8771', 'train_tokens_per_second': '1977'} +{'loss': '0.374', 'grad_norm': '1.046', 'learning_rate': '4.993e-05', 'epoch': '0.2133', 'num_input_tokens_seen': 17340137, 'train_runtime': '8772', 'train_tokens_per_second': '1977'} +{'loss': '0.6027', 'grad_norm': '1.502', 'learning_rate': '4.993e-05', 'epoch': '0.2133', 'num_input_tokens_seen': 17342184, 'train_runtime': '8773', 'train_tokens_per_second': '1977'} +{'loss': '0.7573', 'grad_norm': '1.598', 'learning_rate': '4.993e-05', 'epoch': '0.2133', 'num_input_tokens_seen': 17344231, 'train_runtime': '8774', 'train_tokens_per_second': '1977'} +{'loss': '0.6216', 'grad_norm': '1.41', 'learning_rate': '4.993e-05', 'epoch': '0.2134', 'num_input_tokens_seen': 17346278, 'train_runtime': '8775', 'train_tokens_per_second': '1977'} +{'loss': '0.5036', 'grad_norm': '1.036', 'learning_rate': '4.993e-05', 'epoch': '0.2134', 'num_input_tokens_seen': 17348325, 'train_runtime': '8776', 'train_tokens_per_second': '1977'} +{'loss': '0.4034', 'grad_norm': '0.9015', 'learning_rate': '4.993e-05', 'epoch': '0.2134', 'num_input_tokens_seen': 17350372, 'train_runtime': '8777', 'train_tokens_per_second': '1977'} +{'loss': '2.363', 'grad_norm': '2.462', 'learning_rate': '4.993e-05', 'epoch': '0.2134', 'num_input_tokens_seen': 17352419, 'train_runtime': '8778', 'train_tokens_per_second': '1977'} +{'loss': '0.6875', 'grad_norm': '1.56', 'learning_rate': '4.993e-05', 'epoch': '0.2135', 'num_input_tokens_seen': 17354466, 'train_runtime': '8779', 'train_tokens_per_second': '1977'} +{'loss': '1.076', 'grad_norm': '1.794', 'learning_rate': '4.993e-05', 'epoch': '0.2135', 'num_input_tokens_seen': 17356513, 'train_runtime': '8780', 'train_tokens_per_second': '1977'} +{'loss': '0.7033', 'grad_norm': '1.178', 'learning_rate': '4.993e-05', 'epoch': '0.2135', 'num_input_tokens_seen': 17358560, 'train_runtime': '8781', 'train_tokens_per_second': '1977'} +{'loss': '1.447', 'grad_norm': '2.299', 'learning_rate': '4.993e-05', 'epoch': '0.2135', 'num_input_tokens_seen': 17360607, 'train_runtime': '8782', 'train_tokens_per_second': '1977'} +{'loss': '0.5963', 'grad_norm': '0.9923', 'learning_rate': '4.993e-05', 'epoch': '0.2136', 'num_input_tokens_seen': 17362654, 'train_runtime': '8783', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '2.053', 'learning_rate': '4.993e-05', 'epoch': '0.2136', 'num_input_tokens_seen': 17364701, 'train_runtime': '8784', 'train_tokens_per_second': '1977'} +{'loss': '0.7191', 'grad_norm': '1.199', 'learning_rate': '4.993e-05', 'epoch': '0.2136', 'num_input_tokens_seen': 17366748, 'train_runtime': '8785', 'train_tokens_per_second': '1977'} +{'loss': '0.3792', 'grad_norm': '0.9307', 'learning_rate': '4.993e-05', 'epoch': '0.2136', 'num_input_tokens_seen': 17368795, 'train_runtime': '8786', 'train_tokens_per_second': '1977'} +{'loss': '0.2909', 'grad_norm': '0.9665', 'learning_rate': '4.993e-05', 'epoch': '0.2137', 'num_input_tokens_seen': 17370842, 'train_runtime': '8787', 'train_tokens_per_second': '1977'} +{'loss': '0.3139', 'grad_norm': '0.9014', 'learning_rate': '4.993e-05', 'epoch': '0.2137', 'num_input_tokens_seen': 17372889, 'train_runtime': '8788', 'train_tokens_per_second': '1977'} +{'loss': '1.844', 'grad_norm': '2.13', 'learning_rate': '4.993e-05', 'epoch': '0.2137', 'num_input_tokens_seen': 17374936, 'train_runtime': '8789', 'train_tokens_per_second': '1977'} +{'loss': '0.863', 'grad_norm': '1.325', 'learning_rate': '4.993e-05', 'epoch': '0.2137', 'num_input_tokens_seen': 17376983, 'train_runtime': '8790', 'train_tokens_per_second': '1977'} +{'loss': '0.5447', 'grad_norm': '1.099', 'learning_rate': '4.993e-05', 'epoch': '0.2138', 'num_input_tokens_seen': 17379030, 'train_runtime': '8791', 'train_tokens_per_second': '1977'} +{'loss': '1.18', 'grad_norm': '1.785', 'learning_rate': '4.993e-05', 'epoch': '0.2138', 'num_input_tokens_seen': 17381077, 'train_runtime': '8793', 'train_tokens_per_second': '1977'} +{'loss': '0.8458', 'grad_norm': '1.146', 'learning_rate': '4.993e-05', 'epoch': '0.2138', 'num_input_tokens_seen': 17383124, 'train_runtime': '8794', 'train_tokens_per_second': '1977'} +{'loss': '0.6399', 'grad_norm': '1.209', 'learning_rate': '4.993e-05', 'epoch': '0.2138', 'num_input_tokens_seen': 17385171, 'train_runtime': '8795', 'train_tokens_per_second': '1977'} +{'loss': '0.7746', 'grad_norm': '1.784', 'learning_rate': '4.993e-05', 'epoch': '0.2139', 'num_input_tokens_seen': 17387218, 'train_runtime': '8796', 'train_tokens_per_second': '1977'} +{'loss': '0.601', 'grad_norm': '1.158', 'learning_rate': '4.993e-05', 'epoch': '0.2139', 'num_input_tokens_seen': 17389265, 'train_runtime': '8797', 'train_tokens_per_second': '1977'} +{'loss': '0.3325', 'grad_norm': '0.7836', 'learning_rate': '4.993e-05', 'epoch': '0.2139', 'num_input_tokens_seen': 17391312, 'train_runtime': '8798', 'train_tokens_per_second': '1977'} +{'loss': '0.4524', 'grad_norm': '1.103', 'learning_rate': '4.993e-05', 'epoch': '0.2139', 'num_input_tokens_seen': 17393359, 'train_runtime': '8799', 'train_tokens_per_second': '1977'} +{'loss': '0.7716', 'grad_norm': '1.146', 'learning_rate': '4.993e-05', 'epoch': '0.214', 'num_input_tokens_seen': 17395406, 'train_runtime': '8800', 'train_tokens_per_second': '1977'} +{'loss': '0.7337', 'grad_norm': '1.22', 'learning_rate': '4.993e-05', 'epoch': '0.214', 'num_input_tokens_seen': 17397453, 'train_runtime': '8801', 'train_tokens_per_second': '1977'} +{'loss': '0.474', 'grad_norm': '1.229', 'learning_rate': '4.993e-05', 'epoch': '0.214', 'num_input_tokens_seen': 17399500, 'train_runtime': '8802', 'train_tokens_per_second': '1977'} +{'loss': '1.679', 'grad_norm': '2.542', 'learning_rate': '4.993e-05', 'epoch': '0.214', 'num_input_tokens_seen': 17401547, 'train_runtime': '8803', 'train_tokens_per_second': '1977'} +{'loss': '1.041', 'grad_norm': '1.623', 'learning_rate': '4.993e-05', 'epoch': '0.2141', 'num_input_tokens_seen': 17403594, 'train_runtime': '8804', 'train_tokens_per_second': '1977'} +{'loss': '0.3644', 'grad_norm': '0.9454', 'learning_rate': '4.993e-05', 'epoch': '0.2141', 'num_input_tokens_seen': 17405641, 'train_runtime': '8805', 'train_tokens_per_second': '1977'} +{'loss': '0.6194', 'grad_norm': '1.246', 'learning_rate': '4.993e-05', 'epoch': '0.2141', 'num_input_tokens_seen': 17407688, 'train_runtime': '8806', 'train_tokens_per_second': '1977'} +{'loss': '1.141', 'grad_norm': '1.97', 'learning_rate': '4.993e-05', 'epoch': '0.2141', 'num_input_tokens_seen': 17409735, 'train_runtime': '8807', 'train_tokens_per_second': '1977'} +{'loss': '0.9806', 'grad_norm': '1.917', 'learning_rate': '4.993e-05', 'epoch': '0.2142', 'num_input_tokens_seen': 17411782, 'train_runtime': '8808', 'train_tokens_per_second': '1977'} +{'loss': '0.7497', 'grad_norm': '1.318', 'learning_rate': '4.993e-05', 'epoch': '0.2142', 'num_input_tokens_seen': 17413829, 'train_runtime': '8809', 'train_tokens_per_second': '1977'} +{'loss': '0.7761', 'grad_norm': '1.703', 'learning_rate': '4.993e-05', 'epoch': '0.2142', 'num_input_tokens_seen': 17415876, 'train_runtime': '8810', 'train_tokens_per_second': '1977'} +{'loss': '0.3615', 'grad_norm': '0.808', 'learning_rate': '4.993e-05', 'epoch': '0.2142', 'num_input_tokens_seen': 17417923, 'train_runtime': '8811', 'train_tokens_per_second': '1977'} +{'loss': '2.054', 'grad_norm': '2.47', 'learning_rate': '4.993e-05', 'epoch': '0.2143', 'num_input_tokens_seen': 17419970, 'train_runtime': '8812', 'train_tokens_per_second': '1977'} +{'loss': '1.385', 'grad_norm': '1.917', 'learning_rate': '4.993e-05', 'epoch': '0.2143', 'num_input_tokens_seen': 17422017, 'train_runtime': '8813', 'train_tokens_per_second': '1977'} +{'loss': '0.2437', 'grad_norm': '0.9099', 'learning_rate': '4.993e-05', 'epoch': '0.2143', 'num_input_tokens_seen': 17424064, 'train_runtime': '8814', 'train_tokens_per_second': '1977'} +{'loss': '2.108', 'grad_norm': '2.16', 'learning_rate': '4.993e-05', 'epoch': '0.2143', 'num_input_tokens_seen': 17426111, 'train_runtime': '8815', 'train_tokens_per_second': '1977'} +{'loss': '0.2959', 'grad_norm': '0.8473', 'learning_rate': '4.993e-05', 'epoch': '0.2144', 'num_input_tokens_seen': 17428158, 'train_runtime': '8816', 'train_tokens_per_second': '1977'} +{'loss': '0.8293', 'grad_norm': '1.678', 'learning_rate': '4.993e-05', 'epoch': '0.2144', 'num_input_tokens_seen': 17430205, 'train_runtime': '8817', 'train_tokens_per_second': '1977'} +{'loss': '0.4528', 'grad_norm': '0.9906', 'learning_rate': '4.993e-05', 'epoch': '0.2144', 'num_input_tokens_seen': 17432252, 'train_runtime': '8818', 'train_tokens_per_second': '1977'} +{'loss': '0.5472', 'grad_norm': '1.251', 'learning_rate': '4.993e-05', 'epoch': '0.2144', 'num_input_tokens_seen': 17434299, 'train_runtime': '8819', 'train_tokens_per_second': '1977'} +{'loss': '1.241', 'grad_norm': '1.481', 'learning_rate': '4.993e-05', 'epoch': '0.2145', 'num_input_tokens_seen': 17436346, 'train_runtime': '8820', 'train_tokens_per_second': '1977'} +{'loss': '0.6846', 'grad_norm': '1.324', 'learning_rate': '4.993e-05', 'epoch': '0.2145', 'num_input_tokens_seen': 17438393, 'train_runtime': '8821', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '2.205', 'learning_rate': '4.993e-05', 'epoch': '0.2145', 'num_input_tokens_seen': 17440440, 'train_runtime': '8823', 'train_tokens_per_second': '1977'} +{'loss': '0.5252', 'grad_norm': '0.9644', 'learning_rate': '4.993e-05', 'epoch': '0.2145', 'num_input_tokens_seen': 17442487, 'train_runtime': '8824', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '1.778', 'learning_rate': '4.993e-05', 'epoch': '0.2146', 'num_input_tokens_seen': 17444534, 'train_runtime': '8825', 'train_tokens_per_second': '1977'} +{'loss': '0.415', 'grad_norm': '1.066', 'learning_rate': '4.993e-05', 'epoch': '0.2146', 'num_input_tokens_seen': 17446581, 'train_runtime': '8826', 'train_tokens_per_second': '1977'} +{'loss': '0.6853', 'grad_norm': '1.423', 'learning_rate': '4.993e-05', 'epoch': '0.2146', 'num_input_tokens_seen': 17448628, 'train_runtime': '8827', 'train_tokens_per_second': '1977'} +{'loss': '0.3732', 'grad_norm': '0.9475', 'learning_rate': '4.993e-05', 'epoch': '0.2146', 'num_input_tokens_seen': 17450675, 'train_runtime': '8828', 'train_tokens_per_second': '1977'} +{'loss': '0.3639', 'grad_norm': '0.9633', 'learning_rate': '4.993e-05', 'epoch': '0.2147', 'num_input_tokens_seen': 17452722, 'train_runtime': '8829', 'train_tokens_per_second': '1977'} +{'loss': '0.985', 'grad_norm': '1.406', 'learning_rate': '4.993e-05', 'epoch': '0.2147', 'num_input_tokens_seen': 17454769, 'train_runtime': '8830', 'train_tokens_per_second': '1977'} +{'loss': '0.6148', 'grad_norm': '1.278', 'learning_rate': '4.993e-05', 'epoch': '0.2147', 'num_input_tokens_seen': 17456816, 'train_runtime': '8831', 'train_tokens_per_second': '1977'} +{'loss': '1.698', 'grad_norm': '3.38', 'learning_rate': '4.993e-05', 'epoch': '0.2147', 'num_input_tokens_seen': 17458863, 'train_runtime': '8832', 'train_tokens_per_second': '1977'} +{'loss': '0.2596', 'grad_norm': '0.7867', 'learning_rate': '4.993e-05', 'epoch': '0.2148', 'num_input_tokens_seen': 17460910, 'train_runtime': '8833', 'train_tokens_per_second': '1977'} +{'loss': '0.7534', 'grad_norm': '1.566', 'learning_rate': '4.993e-05', 'epoch': '0.2148', 'num_input_tokens_seen': 17462957, 'train_runtime': '8834', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '2.431', 'learning_rate': '4.993e-05', 'epoch': '0.2148', 'num_input_tokens_seen': 17465004, 'train_runtime': '8835', 'train_tokens_per_second': '1977'} +{'loss': '0.2347', 'grad_norm': '0.8791', 'learning_rate': '4.993e-05', 'epoch': '0.2148', 'num_input_tokens_seen': 17467051, 'train_runtime': '8836', 'train_tokens_per_second': '1977'} +{'loss': '0.6289', 'grad_norm': '1.324', 'learning_rate': '4.993e-05', 'epoch': '0.2149', 'num_input_tokens_seen': 17469098, 'train_runtime': '8837', 'train_tokens_per_second': '1977'} +{'loss': '1.045', 'grad_norm': '1.586', 'learning_rate': '4.993e-05', 'epoch': '0.2149', 'num_input_tokens_seen': 17471145, 'train_runtime': '8838', 'train_tokens_per_second': '1977'} +{'loss': '0.2625', 'grad_norm': '0.7339', 'learning_rate': '4.993e-05', 'epoch': '0.2149', 'num_input_tokens_seen': 17473192, 'train_runtime': '8839', 'train_tokens_per_second': '1977'} +{'loss': '0.3373', 'grad_norm': '0.8175', 'learning_rate': '4.993e-05', 'epoch': '0.2149', 'num_input_tokens_seen': 17475239, 'train_runtime': '8840', 'train_tokens_per_second': '1977'} +{'loss': '0.7236', 'grad_norm': '1.248', 'learning_rate': '4.993e-05', 'epoch': '0.215', 'num_input_tokens_seen': 17477286, 'train_runtime': '8841', 'train_tokens_per_second': '1977'} +{'loss': '0.4539', 'grad_norm': '1.073', 'learning_rate': '4.993e-05', 'epoch': '0.215', 'num_input_tokens_seen': 17479333, 'train_runtime': '8842', 'train_tokens_per_second': '1977'} +{'loss': '0.3618', 'grad_norm': '0.7164', 'learning_rate': '4.993e-05', 'epoch': '0.215', 'num_input_tokens_seen': 17481380, 'train_runtime': '8843', 'train_tokens_per_second': '1977'} +{'loss': '0.8037', 'grad_norm': '1.397', 'learning_rate': '4.993e-05', 'epoch': '0.215', 'num_input_tokens_seen': 17483427, 'train_runtime': '8844', 'train_tokens_per_second': '1977'} +{'loss': '0.9614', 'grad_norm': '1.333', 'learning_rate': '4.993e-05', 'epoch': '0.2151', 'num_input_tokens_seen': 17485474, 'train_runtime': '8845', 'train_tokens_per_second': '1977'} +{'loss': '0.5837', 'grad_norm': '1.46', 'learning_rate': '4.993e-05', 'epoch': '0.2151', 'num_input_tokens_seen': 17487521, 'train_runtime': '8846', 'train_tokens_per_second': '1977'} +{'loss': '0.5075', 'grad_norm': '1.15', 'learning_rate': '4.993e-05', 'epoch': '0.2151', 'num_input_tokens_seen': 17489568, 'train_runtime': '8847', 'train_tokens_per_second': '1977'} +{'loss': '0.6292', 'grad_norm': '1.68', 'learning_rate': '4.993e-05', 'epoch': '0.2151', 'num_input_tokens_seen': 17491615, 'train_runtime': '8848', 'train_tokens_per_second': '1977'} +{'loss': '2.112', 'grad_norm': '2.811', 'learning_rate': '4.993e-05', 'epoch': '0.2152', 'num_input_tokens_seen': 17493662, 'train_runtime': '8849', 'train_tokens_per_second': '1977'} +{'loss': '0.3414', 'grad_norm': '0.8115', 'learning_rate': '4.993e-05', 'epoch': '0.2152', 'num_input_tokens_seen': 17495709, 'train_runtime': '8850', 'train_tokens_per_second': '1977'} +{'loss': '0.542', 'grad_norm': '1.122', 'learning_rate': '4.993e-05', 'epoch': '0.2152', 'num_input_tokens_seen': 17497756, 'train_runtime': '8851', 'train_tokens_per_second': '1977'} +{'loss': '0.8212', 'grad_norm': '1.375', 'learning_rate': '4.993e-05', 'epoch': '0.2152', 'num_input_tokens_seen': 17499803, 'train_runtime': '8852', 'train_tokens_per_second': '1977'} +{'loss': '0.6676', 'grad_norm': '1.124', 'learning_rate': '4.993e-05', 'epoch': '0.2153', 'num_input_tokens_seen': 17501850, 'train_runtime': '8854', 'train_tokens_per_second': '1977'} +{'loss': '0.9535', 'grad_norm': '1.61', 'learning_rate': '4.993e-05', 'epoch': '0.2153', 'num_input_tokens_seen': 17503897, 'train_runtime': '8855', 'train_tokens_per_second': '1977'} +{'loss': '1.498', 'grad_norm': '2.142', 'learning_rate': '4.993e-05', 'epoch': '0.2153', 'num_input_tokens_seen': 17505944, 'train_runtime': '8856', 'train_tokens_per_second': '1977'} +{'loss': '0.4164', 'grad_norm': '0.9122', 'learning_rate': '4.993e-05', 'epoch': '0.2153', 'num_input_tokens_seen': 17507991, 'train_runtime': '8857', 'train_tokens_per_second': '1977'} +{'loss': '0.2106', 'grad_norm': '0.9418', 'learning_rate': '4.993e-05', 'epoch': '0.2154', 'num_input_tokens_seen': 17510038, 'train_runtime': '8858', 'train_tokens_per_second': '1977'} +{'loss': '0.8307', 'grad_norm': '1.48', 'learning_rate': '4.993e-05', 'epoch': '0.2154', 'num_input_tokens_seen': 17512085, 'train_runtime': '8859', 'train_tokens_per_second': '1977'} +{'loss': '2.221', 'grad_norm': '2.321', 'learning_rate': '4.993e-05', 'epoch': '0.2154', 'num_input_tokens_seen': 17514132, 'train_runtime': '8860', 'train_tokens_per_second': '1977'} +{'loss': '0.577', 'grad_norm': '1.239', 'learning_rate': '4.993e-05', 'epoch': '0.2154', 'num_input_tokens_seen': 17516179, 'train_runtime': '8861', 'train_tokens_per_second': '1977'} +{'loss': '1.11', 'grad_norm': '1.992', 'learning_rate': '4.993e-05', 'epoch': '0.2155', 'num_input_tokens_seen': 17518226, 'train_runtime': '8862', 'train_tokens_per_second': '1977'} +{'loss': '0.2998', 'grad_norm': '0.901', 'learning_rate': '4.993e-05', 'epoch': '0.2155', 'num_input_tokens_seen': 17520273, 'train_runtime': '8863', 'train_tokens_per_second': '1977'} +{'loss': '0.8059', 'grad_norm': '1.482', 'learning_rate': '4.993e-05', 'epoch': '0.2155', 'num_input_tokens_seen': 17522320, 'train_runtime': '8864', 'train_tokens_per_second': '1977'} +{'loss': '0.3941', 'grad_norm': '0.8636', 'learning_rate': '4.993e-05', 'epoch': '0.2156', 'num_input_tokens_seen': 17524367, 'train_runtime': '8865', 'train_tokens_per_second': '1977'} +{'loss': '0.4399', 'grad_norm': '1.126', 'learning_rate': '4.993e-05', 'epoch': '0.2156', 'num_input_tokens_seen': 17526414, 'train_runtime': '8866', 'train_tokens_per_second': '1977'} +{'loss': '0.2896', 'grad_norm': '0.9902', 'learning_rate': '4.993e-05', 'epoch': '0.2156', 'num_input_tokens_seen': 17528461, 'train_runtime': '8867', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '1.264', 'learning_rate': '4.993e-05', 'epoch': '0.2156', 'num_input_tokens_seen': 17530508, 'train_runtime': '8868', 'train_tokens_per_second': '1977'} +{'loss': '0.3217', 'grad_norm': '0.7611', 'learning_rate': '4.993e-05', 'epoch': '0.2157', 'num_input_tokens_seen': 17532555, 'train_runtime': '8869', 'train_tokens_per_second': '1977'} +{'loss': '1.286', 'grad_norm': '1.874', 'learning_rate': '4.993e-05', 'epoch': '0.2157', 'num_input_tokens_seen': 17534602, 'train_runtime': '8870', 'train_tokens_per_second': '1977'} +{'loss': '0.4578', 'grad_norm': '0.9367', 'learning_rate': '4.993e-05', 'epoch': '0.2157', 'num_input_tokens_seen': 17536649, 'train_runtime': '8871', 'train_tokens_per_second': '1977'} +{'loss': '0.6013', 'grad_norm': '1.097', 'learning_rate': '4.993e-05', 'epoch': '0.2157', 'num_input_tokens_seen': 17538696, 'train_runtime': '8872', 'train_tokens_per_second': '1977'} +{'loss': '0.3491', 'grad_norm': '0.8046', 'learning_rate': '4.993e-05', 'epoch': '0.2158', 'num_input_tokens_seen': 17540743, 'train_runtime': '8873', 'train_tokens_per_second': '1977'} +{'loss': '0.7562', 'grad_norm': '1.381', 'learning_rate': '4.993e-05', 'epoch': '0.2158', 'num_input_tokens_seen': 17542790, 'train_runtime': '8874', 'train_tokens_per_second': '1977'} +{'loss': '0.5175', 'grad_norm': '1.076', 'learning_rate': '4.993e-05', 'epoch': '0.2158', 'num_input_tokens_seen': 17544837, 'train_runtime': '8875', 'train_tokens_per_second': '1977'} +{'loss': '1.939', 'grad_norm': '2.771', 'learning_rate': '4.993e-05', 'epoch': '0.2158', 'num_input_tokens_seen': 17546884, 'train_runtime': '8876', 'train_tokens_per_second': '1977'} +{'loss': '0.3554', 'grad_norm': '0.7533', 'learning_rate': '4.993e-05', 'epoch': '0.2159', 'num_input_tokens_seen': 17548931, 'train_runtime': '8877', 'train_tokens_per_second': '1977'} +{'loss': '0.4385', 'grad_norm': '1.076', 'learning_rate': '4.993e-05', 'epoch': '0.2159', 'num_input_tokens_seen': 17550978, 'train_runtime': '8878', 'train_tokens_per_second': '1977'} +{'loss': '0.9245', 'grad_norm': '1.404', 'learning_rate': '4.993e-05', 'epoch': '0.2159', 'num_input_tokens_seen': 17553025, 'train_runtime': '8879', 'train_tokens_per_second': '1977'} +{'loss': '0.8354', 'grad_norm': '1.737', 'learning_rate': '4.993e-05', 'epoch': '0.2159', 'num_input_tokens_seen': 17555072, 'train_runtime': '8880', 'train_tokens_per_second': '1977'} +{'loss': '0.2429', 'grad_norm': '0.9372', 'learning_rate': '4.993e-05', 'epoch': '0.216', 'num_input_tokens_seen': 17557119, 'train_runtime': '8881', 'train_tokens_per_second': '1977'} +{'loss': '0.7804', 'grad_norm': '1.049', 'learning_rate': '4.993e-05', 'epoch': '0.216', 'num_input_tokens_seen': 17559166, 'train_runtime': '8882', 'train_tokens_per_second': '1977'} +{'loss': '0.5127', 'grad_norm': '1.219', 'learning_rate': '4.993e-05', 'epoch': '0.216', 'num_input_tokens_seen': 17561213, 'train_runtime': '8884', 'train_tokens_per_second': '1977'} +{'loss': '0.3017', 'grad_norm': '0.8696', 'learning_rate': '4.993e-05', 'epoch': '0.216', 'num_input_tokens_seen': 17563260, 'train_runtime': '8885', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '1.525', 'learning_rate': '4.993e-05', 'epoch': '0.2161', 'num_input_tokens_seen': 17565307, 'train_runtime': '8886', 'train_tokens_per_second': '1977'} +{'loss': '0.9521', 'grad_norm': '1.385', 'learning_rate': '4.993e-05', 'epoch': '0.2161', 'num_input_tokens_seen': 17567354, 'train_runtime': '8887', 'train_tokens_per_second': '1977'} +{'loss': '0.415', 'grad_norm': '1.148', 'learning_rate': '4.993e-05', 'epoch': '0.2161', 'num_input_tokens_seen': 17569401, 'train_runtime': '8888', 'train_tokens_per_second': '1977'} +{'loss': '0.296', 'grad_norm': '0.6823', 'learning_rate': '4.993e-05', 'epoch': '0.2161', 'num_input_tokens_seen': 17571448, 'train_runtime': '8889', 'train_tokens_per_second': '1977'} +{'loss': '0.4453', 'grad_norm': '1.084', 'learning_rate': '4.993e-05', 'epoch': '0.2162', 'num_input_tokens_seen': 17573495, 'train_runtime': '8890', 'train_tokens_per_second': '1977'} +{'loss': '0.4775', 'grad_norm': '1.104', 'learning_rate': '4.993e-05', 'epoch': '0.2162', 'num_input_tokens_seen': 17575542, 'train_runtime': '8891', 'train_tokens_per_second': '1977'} +{'loss': '0.7279', 'grad_norm': '1.362', 'learning_rate': '4.993e-05', 'epoch': '0.2162', 'num_input_tokens_seen': 17577589, 'train_runtime': '8892', 'train_tokens_per_second': '1977'} +{'loss': '1.091', 'grad_norm': '1.349', 'learning_rate': '4.993e-05', 'epoch': '0.2162', 'num_input_tokens_seen': 17579636, 'train_runtime': '8893', 'train_tokens_per_second': '1977'} +{'loss': '0.3905', 'grad_norm': '0.8924', 'learning_rate': '4.993e-05', 'epoch': '0.2163', 'num_input_tokens_seen': 17581683, 'train_runtime': '8894', 'train_tokens_per_second': '1977'} +{'loss': '0.3237', 'grad_norm': '1.057', 'learning_rate': '4.993e-05', 'epoch': '0.2163', 'num_input_tokens_seen': 17583730, 'train_runtime': '8895', 'train_tokens_per_second': '1977'} +{'loss': '0.2137', 'grad_norm': '0.9308', 'learning_rate': '4.993e-05', 'epoch': '0.2163', 'num_input_tokens_seen': 17585777, 'train_runtime': '8896', 'train_tokens_per_second': '1977'} +{'loss': '0.7744', 'grad_norm': '1.835', 'learning_rate': '4.993e-05', 'epoch': '0.2163', 'num_input_tokens_seen': 17587824, 'train_runtime': '8897', 'train_tokens_per_second': '1977'} +{'loss': '0.3117', 'grad_norm': '1.085', 'learning_rate': '4.993e-05', 'epoch': '0.2164', 'num_input_tokens_seen': 17589871, 'train_runtime': '8898', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '2.118', 'learning_rate': '4.993e-05', 'epoch': '0.2164', 'num_input_tokens_seen': 17591918, 'train_runtime': '8899', 'train_tokens_per_second': '1977'} +{'loss': '0.454', 'grad_norm': '1.277', 'learning_rate': '4.993e-05', 'epoch': '0.2164', 'num_input_tokens_seen': 17593965, 'train_runtime': '8900', 'train_tokens_per_second': '1977'} +{'loss': '0.6148', 'grad_norm': '1.386', 'learning_rate': '4.993e-05', 'epoch': '0.2164', 'num_input_tokens_seen': 17596012, 'train_runtime': '8901', 'train_tokens_per_second': '1977'} +{'loss': '0.6952', 'grad_norm': '1.152', 'learning_rate': '4.993e-05', 'epoch': '0.2165', 'num_input_tokens_seen': 17598059, 'train_runtime': '8902', 'train_tokens_per_second': '1977'} +{'loss': '0.8019', 'grad_norm': '1.298', 'learning_rate': '4.993e-05', 'epoch': '0.2165', 'num_input_tokens_seen': 17600106, 'train_runtime': '8903', 'train_tokens_per_second': '1977'} +{'loss': '0.3619', 'grad_norm': '1.235', 'learning_rate': '4.993e-05', 'epoch': '0.2165', 'num_input_tokens_seen': 17602153, 'train_runtime': '8904', 'train_tokens_per_second': '1977'} +{'loss': '0.3425', 'grad_norm': '0.9485', 'learning_rate': '4.993e-05', 'epoch': '0.2165', 'num_input_tokens_seen': 17604200, 'train_runtime': '8905', 'train_tokens_per_second': '1977'} +{'loss': '0.4819', 'grad_norm': '1.186', 'learning_rate': '4.993e-05', 'epoch': '0.2166', 'num_input_tokens_seen': 17606247, 'train_runtime': '8906', 'train_tokens_per_second': '1977'} +{'loss': '0.336', 'grad_norm': '0.8427', 'learning_rate': '4.993e-05', 'epoch': '0.2166', 'num_input_tokens_seen': 17608294, 'train_runtime': '8907', 'train_tokens_per_second': '1977'} +{'loss': '1.431', 'grad_norm': '2.201', 'learning_rate': '4.993e-05', 'epoch': '0.2166', 'num_input_tokens_seen': 17610341, 'train_runtime': '8908', 'train_tokens_per_second': '1977'} +{'loss': '1.885', 'grad_norm': '2.133', 'learning_rate': '4.993e-05', 'epoch': '0.2166', 'num_input_tokens_seen': 17612388, 'train_runtime': '8909', 'train_tokens_per_second': '1977'} +{'loss': '0.5604', 'grad_norm': '1.176', 'learning_rate': '4.993e-05', 'epoch': '0.2167', 'num_input_tokens_seen': 17614435, 'train_runtime': '8910', 'train_tokens_per_second': '1977'} +{'loss': '0.5251', 'grad_norm': '1.244', 'learning_rate': '4.993e-05', 'epoch': '0.2167', 'num_input_tokens_seen': 17616482, 'train_runtime': '8911', 'train_tokens_per_second': '1977'} +{'loss': '0.2588', 'grad_norm': '0.901', 'learning_rate': '4.993e-05', 'epoch': '0.2167', 'num_input_tokens_seen': 17618529, 'train_runtime': '8912', 'train_tokens_per_second': '1977'} +{'loss': '0.3997', 'grad_norm': '1.026', 'learning_rate': '4.993e-05', 'epoch': '0.2167', 'num_input_tokens_seen': 17620576, 'train_runtime': '8914', 'train_tokens_per_second': '1977'} +{'loss': '0.8815', 'grad_norm': '1.291', 'learning_rate': '4.993e-05', 'epoch': '0.2168', 'num_input_tokens_seen': 17622623, 'train_runtime': '8915', 'train_tokens_per_second': '1977'} +{'loss': '1.603', 'grad_norm': '2.24', 'learning_rate': '4.993e-05', 'epoch': '0.2168', 'num_input_tokens_seen': 17624670, 'train_runtime': '8916', 'train_tokens_per_second': '1977'} +{'loss': '0.544', 'grad_norm': '1.293', 'learning_rate': '4.993e-05', 'epoch': '0.2168', 'num_input_tokens_seen': 17626717, 'train_runtime': '8917', 'train_tokens_per_second': '1977'} +{'loss': '0.3021', 'grad_norm': '0.8496', 'learning_rate': '4.993e-05', 'epoch': '0.2168', 'num_input_tokens_seen': 17628764, 'train_runtime': '8918', 'train_tokens_per_second': '1977'} +{'loss': '1.383', 'grad_norm': '2.338', 'learning_rate': '4.993e-05', 'epoch': '0.2169', 'num_input_tokens_seen': 17630811, 'train_runtime': '8919', 'train_tokens_per_second': '1977'} +{'loss': '0.378', 'grad_norm': '0.9626', 'learning_rate': '4.993e-05', 'epoch': '0.2169', 'num_input_tokens_seen': 17632858, 'train_runtime': '8920', 'train_tokens_per_second': '1977'} +{'loss': '1.324', 'grad_norm': '2.128', 'learning_rate': '4.993e-05', 'epoch': '0.2169', 'num_input_tokens_seen': 17634905, 'train_runtime': '8921', 'train_tokens_per_second': '1977'} +{'loss': '1.296', 'grad_norm': '1.693', 'learning_rate': '4.993e-05', 'epoch': '0.2169', 'num_input_tokens_seen': 17636952, 'train_runtime': '8922', 'train_tokens_per_second': '1977'} +{'loss': '1.497', 'grad_norm': '2.322', 'learning_rate': '4.993e-05', 'epoch': '0.217', 'num_input_tokens_seen': 17638999, 'train_runtime': '8923', 'train_tokens_per_second': '1977'} +{'loss': '0.3665', 'grad_norm': '1.061', 'learning_rate': '4.993e-05', 'epoch': '0.217', 'num_input_tokens_seen': 17641046, 'train_runtime': '8924', 'train_tokens_per_second': '1977'} +{'loss': '0.8723', 'grad_norm': '1.381', 'learning_rate': '4.993e-05', 'epoch': '0.217', 'num_input_tokens_seen': 17643093, 'train_runtime': '8925', 'train_tokens_per_second': '1977'} +{'loss': '0.4305', 'grad_norm': '1.157', 'learning_rate': '4.993e-05', 'epoch': '0.217', 'num_input_tokens_seen': 17645140, 'train_runtime': '8926', 'train_tokens_per_second': '1977'} +{'loss': '0.4557', 'grad_norm': '1.227', 'learning_rate': '4.993e-05', 'epoch': '0.2171', 'num_input_tokens_seen': 17647187, 'train_runtime': '8927', 'train_tokens_per_second': '1977'} +{'loss': '0.3116', 'grad_norm': '0.7647', 'learning_rate': '4.993e-05', 'epoch': '0.2171', 'num_input_tokens_seen': 17649234, 'train_runtime': '8928', 'train_tokens_per_second': '1977'} +{'loss': '0.8921', 'grad_norm': '1.638', 'learning_rate': '4.993e-05', 'epoch': '0.2171', 'num_input_tokens_seen': 17651281, 'train_runtime': '8929', 'train_tokens_per_second': '1977'} +{'loss': '1.305', 'grad_norm': '1.938', 'learning_rate': '4.993e-05', 'epoch': '0.2171', 'num_input_tokens_seen': 17653328, 'train_runtime': '8930', 'train_tokens_per_second': '1977'} +{'loss': '0.7066', 'grad_norm': '1.369', 'learning_rate': '4.993e-05', 'epoch': '0.2172', 'num_input_tokens_seen': 17655375, 'train_runtime': '8931', 'train_tokens_per_second': '1977'} +{'loss': '0.6726', 'grad_norm': '0.9838', 'learning_rate': '4.993e-05', 'epoch': '0.2172', 'num_input_tokens_seen': 17657422, 'train_runtime': '8932', 'train_tokens_per_second': '1977'} +{'loss': '0.4786', 'grad_norm': '1.102', 'learning_rate': '4.993e-05', 'epoch': '0.2172', 'num_input_tokens_seen': 17659469, 'train_runtime': '8933', 'train_tokens_per_second': '1977'} +{'loss': '0.8322', 'grad_norm': '1.155', 'learning_rate': '4.993e-05', 'epoch': '0.2172', 'num_input_tokens_seen': 17661516, 'train_runtime': '8934', 'train_tokens_per_second': '1977'} +{'loss': '0.8901', 'grad_norm': '1.454', 'learning_rate': '4.993e-05', 'epoch': '0.2173', 'num_input_tokens_seen': 17663563, 'train_runtime': '8935', 'train_tokens_per_second': '1977'} +{'loss': '0.5441', 'grad_norm': '1.547', 'learning_rate': '4.993e-05', 'epoch': '0.2173', 'num_input_tokens_seen': 17665610, 'train_runtime': '8936', 'train_tokens_per_second': '1977'} +{'loss': '1.169', 'grad_norm': '2.013', 'learning_rate': '4.993e-05', 'epoch': '0.2173', 'num_input_tokens_seen': 17667657, 'train_runtime': '8937', 'train_tokens_per_second': '1977'} +{'loss': '0.6657', 'grad_norm': '1.415', 'learning_rate': '4.993e-05', 'epoch': '0.2173', 'num_input_tokens_seen': 17669704, 'train_runtime': '8938', 'train_tokens_per_second': '1977'} +{'loss': '0.9576', 'grad_norm': '1.349', 'learning_rate': '4.993e-05', 'epoch': '0.2174', 'num_input_tokens_seen': 17671751, 'train_runtime': '8939', 'train_tokens_per_second': '1977'} +{'loss': '1.028', 'grad_norm': '1.573', 'learning_rate': '4.993e-05', 'epoch': '0.2174', 'num_input_tokens_seen': 17673798, 'train_runtime': '8940', 'train_tokens_per_second': '1977'} +{'loss': '1.087', 'grad_norm': '1.416', 'learning_rate': '4.993e-05', 'epoch': '0.2174', 'num_input_tokens_seen': 17675845, 'train_runtime': '8941', 'train_tokens_per_second': '1977'} +{'loss': '0.5128', 'grad_norm': '1.251', 'learning_rate': '4.993e-05', 'epoch': '0.2174', 'num_input_tokens_seen': 17677892, 'train_runtime': '8942', 'train_tokens_per_second': '1977'} +{'loss': '1.587', 'grad_norm': '2.288', 'learning_rate': '4.993e-05', 'epoch': '0.2175', 'num_input_tokens_seen': 17679939, 'train_runtime': '8943', 'train_tokens_per_second': '1977'} +{'loss': '0.8313', 'grad_norm': '1.41', 'learning_rate': '4.993e-05', 'epoch': '0.2175', 'num_input_tokens_seen': 17681986, 'train_runtime': '8945', 'train_tokens_per_second': '1977'} +{'loss': '0.4798', 'grad_norm': '1.126', 'learning_rate': '4.993e-05', 'epoch': '0.2175', 'num_input_tokens_seen': 17684033, 'train_runtime': '8946', 'train_tokens_per_second': '1977'} +{'loss': '0.8005', 'grad_norm': '1.859', 'learning_rate': '4.993e-05', 'epoch': '0.2175', 'num_input_tokens_seen': 17686080, 'train_runtime': '8947', 'train_tokens_per_second': '1977'} +{'loss': '1.754', 'grad_norm': '2.327', 'learning_rate': '4.993e-05', 'epoch': '0.2176', 'num_input_tokens_seen': 17688127, 'train_runtime': '8948', 'train_tokens_per_second': '1977'} +{'loss': '0.9184', 'grad_norm': '1.275', 'learning_rate': '4.993e-05', 'epoch': '0.2176', 'num_input_tokens_seen': 17690174, 'train_runtime': '8949', 'train_tokens_per_second': '1977'} +{'loss': '0.294', 'grad_norm': '0.9683', 'learning_rate': '4.993e-05', 'epoch': '0.2176', 'num_input_tokens_seen': 17692221, 'train_runtime': '8950', 'train_tokens_per_second': '1977'} +{'loss': '0.5921', 'grad_norm': '1.338', 'learning_rate': '4.993e-05', 'epoch': '0.2176', 'num_input_tokens_seen': 17694268, 'train_runtime': '8951', 'train_tokens_per_second': '1977'} +{'loss': '0.4393', 'grad_norm': '1.037', 'learning_rate': '4.993e-05', 'epoch': '0.2177', 'num_input_tokens_seen': 17696315, 'train_runtime': '8952', 'train_tokens_per_second': '1977'} +{'loss': '1.2', 'grad_norm': '2.243', 'learning_rate': '4.993e-05', 'epoch': '0.2177', 'num_input_tokens_seen': 17698362, 'train_runtime': '8953', 'train_tokens_per_second': '1977'} +{'loss': '0.4282', 'grad_norm': '1.295', 'learning_rate': '4.993e-05', 'epoch': '0.2177', 'num_input_tokens_seen': 17700409, 'train_runtime': '8954', 'train_tokens_per_second': '1977'} +{'loss': '0.7491', 'grad_norm': '1.264', 'learning_rate': '4.993e-05', 'epoch': '0.2177', 'num_input_tokens_seen': 17702456, 'train_runtime': '8955', 'train_tokens_per_second': '1977'} +{'loss': '0.8949', 'grad_norm': '1.891', 'learning_rate': '4.993e-05', 'epoch': '0.2178', 'num_input_tokens_seen': 17704503, 'train_runtime': '8956', 'train_tokens_per_second': '1977'} +{'loss': '0.9135', 'grad_norm': '1.387', 'learning_rate': '4.993e-05', 'epoch': '0.2178', 'num_input_tokens_seen': 17706550, 'train_runtime': '8957', 'train_tokens_per_second': '1977'} +{'loss': '1.023', 'grad_norm': '1.83', 'learning_rate': '4.993e-05', 'epoch': '0.2178', 'num_input_tokens_seen': 17708597, 'train_runtime': '8958', 'train_tokens_per_second': '1977'} +{'loss': '0.8422', 'grad_norm': '1.342', 'learning_rate': '4.993e-05', 'epoch': '0.2178', 'num_input_tokens_seen': 17710644, 'train_runtime': '8959', 'train_tokens_per_second': '1977'} +{'loss': '0.3683', 'grad_norm': '1.189', 'learning_rate': '4.993e-05', 'epoch': '0.2179', 'num_input_tokens_seen': 17712691, 'train_runtime': '8960', 'train_tokens_per_second': '1977'} +{'loss': '0.175', 'grad_norm': '0.8742', 'learning_rate': '4.993e-05', 'epoch': '0.2179', 'num_input_tokens_seen': 17714738, 'train_runtime': '8961', 'train_tokens_per_second': '1977'} +{'loss': '0.4261', 'grad_norm': '1.016', 'learning_rate': '4.993e-05', 'epoch': '0.2179', 'num_input_tokens_seen': 17716785, 'train_runtime': '8962', 'train_tokens_per_second': '1977'} +{'loss': '0.4216', 'grad_norm': '0.9898', 'learning_rate': '4.993e-05', 'epoch': '0.2179', 'num_input_tokens_seen': 17718832, 'train_runtime': '8963', 'train_tokens_per_second': '1977'} +{'loss': '0.7707', 'grad_norm': '1.347', 'learning_rate': '4.993e-05', 'epoch': '0.218', 'num_input_tokens_seen': 17720879, 'train_runtime': '8964', 'train_tokens_per_second': '1977'} +{'loss': '0.548', 'grad_norm': '1.113', 'learning_rate': '4.993e-05', 'epoch': '0.218', 'num_input_tokens_seen': 17722926, 'train_runtime': '8965', 'train_tokens_per_second': '1977'} +{'loss': '0.4353', 'grad_norm': '1.192', 'learning_rate': '4.993e-05', 'epoch': '0.218', 'num_input_tokens_seen': 17724973, 'train_runtime': '8966', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '2.493', 'learning_rate': '4.993e-05', 'epoch': '0.218', 'num_input_tokens_seen': 17727020, 'train_runtime': '8967', 'train_tokens_per_second': '1977'} +{'loss': '0.5465', 'grad_norm': '1.096', 'learning_rate': '4.993e-05', 'epoch': '0.2181', 'num_input_tokens_seen': 17729067, 'train_runtime': '8968', 'train_tokens_per_second': '1977'} +{'loss': '0.5893', 'grad_norm': '1.441', 'learning_rate': '4.993e-05', 'epoch': '0.2181', 'num_input_tokens_seen': 17731114, 'train_runtime': '8969', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.77', 'learning_rate': '4.993e-05', 'epoch': '0.2181', 'num_input_tokens_seen': 17733161, 'train_runtime': '8970', 'train_tokens_per_second': '1977'} +{'loss': '0.753', 'grad_norm': '1.7', 'learning_rate': '4.993e-05', 'epoch': '0.2181', 'num_input_tokens_seen': 17735208, 'train_runtime': '8971', 'train_tokens_per_second': '1977'} +{'loss': '0.7241', 'grad_norm': '1.395', 'learning_rate': '4.993e-05', 'epoch': '0.2182', 'num_input_tokens_seen': 17737255, 'train_runtime': '8972', 'train_tokens_per_second': '1977'} +{'loss': '0.9857', 'grad_norm': '1.684', 'learning_rate': '4.993e-05', 'epoch': '0.2182', 'num_input_tokens_seen': 17739302, 'train_runtime': '8973', 'train_tokens_per_second': '1977'} +{'loss': '0.3649', 'grad_norm': '1.055', 'learning_rate': '4.993e-05', 'epoch': '0.2182', 'num_input_tokens_seen': 17741349, 'train_runtime': '8975', 'train_tokens_per_second': '1977'} +{'loss': '0.5128', 'grad_norm': '1.095', 'learning_rate': '4.993e-05', 'epoch': '0.2182', 'num_input_tokens_seen': 17743396, 'train_runtime': '8976', 'train_tokens_per_second': '1977'} +{'loss': '0.5806', 'grad_norm': '1.18', 'learning_rate': '4.993e-05', 'epoch': '0.2183', 'num_input_tokens_seen': 17745443, 'train_runtime': '8977', 'train_tokens_per_second': '1977'} +{'loss': '0.6588', 'grad_norm': '1.301', 'learning_rate': '4.993e-05', 'epoch': '0.2183', 'num_input_tokens_seen': 17747490, 'train_runtime': '8978', 'train_tokens_per_second': '1977'} +{'loss': '0.609', 'grad_norm': '1.505', 'learning_rate': '4.993e-05', 'epoch': '0.2183', 'num_input_tokens_seen': 17749537, 'train_runtime': '8979', 'train_tokens_per_second': '1977'} +{'loss': '1.485', 'grad_norm': '2.099', 'learning_rate': '4.993e-05', 'epoch': '0.2183', 'num_input_tokens_seen': 17751584, 'train_runtime': '8980', 'train_tokens_per_second': '1977'} +{'loss': '0.393', 'grad_norm': '1.103', 'learning_rate': '4.993e-05', 'epoch': '0.2184', 'num_input_tokens_seen': 17753631, 'train_runtime': '8981', 'train_tokens_per_second': '1977'} +{'loss': '0.4209', 'grad_norm': '1.067', 'learning_rate': '4.993e-05', 'epoch': '0.2184', 'num_input_tokens_seen': 17755678, 'train_runtime': '8982', 'train_tokens_per_second': '1977'} +{'loss': '2.07', 'grad_norm': '2.552', 'learning_rate': '4.993e-05', 'epoch': '0.2184', 'num_input_tokens_seen': 17757725, 'train_runtime': '8983', 'train_tokens_per_second': '1977'} +{'loss': '0.3734', 'grad_norm': '1.03', 'learning_rate': '4.993e-05', 'epoch': '0.2184', 'num_input_tokens_seen': 17759772, 'train_runtime': '8984', 'train_tokens_per_second': '1977'} +{'loss': '0.8234', 'grad_norm': '1.906', 'learning_rate': '4.993e-05', 'epoch': '0.2185', 'num_input_tokens_seen': 17761819, 'train_runtime': '8985', 'train_tokens_per_second': '1977'} +{'loss': '0.411', 'grad_norm': '1.052', 'learning_rate': '4.993e-05', 'epoch': '0.2185', 'num_input_tokens_seen': 17763866, 'train_runtime': '8986', 'train_tokens_per_second': '1977'} +{'loss': '0.2899', 'grad_norm': '0.913', 'learning_rate': '4.993e-05', 'epoch': '0.2185', 'num_input_tokens_seen': 17765913, 'train_runtime': '8987', 'train_tokens_per_second': '1977'} +{'loss': '1.315', 'grad_norm': '1.964', 'learning_rate': '4.993e-05', 'epoch': '0.2185', 'num_input_tokens_seen': 17767960, 'train_runtime': '8988', 'train_tokens_per_second': '1977'} +{'loss': '0.2731', 'grad_norm': '1.12', 'learning_rate': '4.993e-05', 'epoch': '0.2186', 'num_input_tokens_seen': 17770007, 'train_runtime': '8989', 'train_tokens_per_second': '1977'} +{'loss': '1.289', 'grad_norm': '2.038', 'learning_rate': '4.993e-05', 'epoch': '0.2186', 'num_input_tokens_seen': 17772054, 'train_runtime': '8990', 'train_tokens_per_second': '1977'} +{'loss': '0.8625', 'grad_norm': '1.074', 'learning_rate': '4.993e-05', 'epoch': '0.2186', 'num_input_tokens_seen': 17774101, 'train_runtime': '8991', 'train_tokens_per_second': '1977'} +{'loss': '1.12', 'grad_norm': '1.917', 'learning_rate': '4.993e-05', 'epoch': '0.2186', 'num_input_tokens_seen': 17776148, 'train_runtime': '8992', 'train_tokens_per_second': '1977'} +{'loss': '0.8431', 'grad_norm': '1.395', 'learning_rate': '4.993e-05', 'epoch': '0.2187', 'num_input_tokens_seen': 17778195, 'train_runtime': '8993', 'train_tokens_per_second': '1977'} +{'loss': '0.7854', 'grad_norm': '1.122', 'learning_rate': '4.993e-05', 'epoch': '0.2187', 'num_input_tokens_seen': 17780242, 'train_runtime': '8994', 'train_tokens_per_second': '1977'} +{'loss': '0.5362', 'grad_norm': '1.335', 'learning_rate': '4.993e-05', 'epoch': '0.2187', 'num_input_tokens_seen': 17782289, 'train_runtime': '8995', 'train_tokens_per_second': '1977'} +{'loss': '1.361', 'grad_norm': '2.642', 'learning_rate': '4.993e-05', 'epoch': '0.2187', 'num_input_tokens_seen': 17784336, 'train_runtime': '8996', 'train_tokens_per_second': '1977'} +{'loss': '1.483', 'grad_norm': '2.413', 'learning_rate': '4.993e-05', 'epoch': '0.2188', 'num_input_tokens_seen': 17786383, 'train_runtime': '8997', 'train_tokens_per_second': '1977'} +{'loss': '0.3311', 'grad_norm': '0.8223', 'learning_rate': '4.993e-05', 'epoch': '0.2188', 'num_input_tokens_seen': 17788430, 'train_runtime': '8998', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '1.6', 'learning_rate': '4.993e-05', 'epoch': '0.2188', 'num_input_tokens_seen': 17790477, 'train_runtime': '8999', 'train_tokens_per_second': '1977'} +{'loss': '0.9286', 'grad_norm': '1.368', 'learning_rate': '4.993e-05', 'epoch': '0.2188', 'num_input_tokens_seen': 17792524, 'train_runtime': '9000', 'train_tokens_per_second': '1977'} +{'loss': '0.3715', 'grad_norm': '0.8977', 'learning_rate': '4.993e-05', 'epoch': '0.2189', 'num_input_tokens_seen': 17794571, 'train_runtime': '9001', 'train_tokens_per_second': '1977'} +{'loss': '0.7419', 'grad_norm': '1.895', 'learning_rate': '4.993e-05', 'epoch': '0.2189', 'num_input_tokens_seen': 17796618, 'train_runtime': '9003', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '1.664', 'learning_rate': '4.993e-05', 'epoch': '0.2189', 'num_input_tokens_seen': 17798665, 'train_runtime': '9004', 'train_tokens_per_second': '1977'} +{'loss': '1.622', 'grad_norm': '2.167', 'learning_rate': '4.993e-05', 'epoch': '0.2189', 'num_input_tokens_seen': 17800712, 'train_runtime': '9005', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '1.128', 'learning_rate': '4.993e-05', 'epoch': '0.219', 'num_input_tokens_seen': 17802759, 'train_runtime': '9006', 'train_tokens_per_second': '1977'} +{'loss': '0.8181', 'grad_norm': '1.103', 'learning_rate': '4.993e-05', 'epoch': '0.219', 'num_input_tokens_seen': 17804806, 'train_runtime': '9007', 'train_tokens_per_second': '1977'} +{'loss': '1.43', 'grad_norm': '2.206', 'learning_rate': '4.993e-05', 'epoch': '0.219', 'num_input_tokens_seen': 17806853, 'train_runtime': '9008', 'train_tokens_per_second': '1977'} +{'loss': '0.4204', 'grad_norm': '0.9303', 'learning_rate': '4.993e-05', 'epoch': '0.219', 'num_input_tokens_seen': 17808900, 'train_runtime': '9009', 'train_tokens_per_second': '1977'} +{'loss': '1.04', 'grad_norm': '1.984', 'learning_rate': '4.993e-05', 'epoch': '0.2191', 'num_input_tokens_seen': 17810947, 'train_runtime': '9010', 'train_tokens_per_second': '1977'} +{'loss': '0.7689', 'grad_norm': '1.266', 'learning_rate': '4.993e-05', 'epoch': '0.2191', 'num_input_tokens_seen': 17812994, 'train_runtime': '9011', 'train_tokens_per_second': '1977'} +{'loss': '0.6169', 'grad_norm': '1.42', 'learning_rate': '4.993e-05', 'epoch': '0.2191', 'num_input_tokens_seen': 17815041, 'train_runtime': '9012', 'train_tokens_per_second': '1977'} +{'loss': '1.767', 'grad_norm': '2.721', 'learning_rate': '4.993e-05', 'epoch': '0.2192', 'num_input_tokens_seen': 17817088, 'train_runtime': '9013', 'train_tokens_per_second': '1977'} +{'loss': '0.7683', 'grad_norm': '1.243', 'learning_rate': '4.993e-05', 'epoch': '0.2192', 'num_input_tokens_seen': 17819135, 'train_runtime': '9014', 'train_tokens_per_second': '1977'} +{'loss': '0.376', 'grad_norm': '0.9843', 'learning_rate': '4.993e-05', 'epoch': '0.2192', 'num_input_tokens_seen': 17821182, 'train_runtime': '9015', 'train_tokens_per_second': '1977'} +{'loss': '0.4702', 'grad_norm': '1.082', 'learning_rate': '4.993e-05', 'epoch': '0.2192', 'num_input_tokens_seen': 17823229, 'train_runtime': '9016', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '2.356', 'learning_rate': '4.993e-05', 'epoch': '0.2193', 'num_input_tokens_seen': 17825276, 'train_runtime': '9017', 'train_tokens_per_second': '1977'} +{'loss': '1.006', 'grad_norm': '1.529', 'learning_rate': '4.993e-05', 'epoch': '0.2193', 'num_input_tokens_seen': 17827323, 'train_runtime': '9018', 'train_tokens_per_second': '1977'} +{'loss': '0.7962', 'grad_norm': '1.191', 'learning_rate': '4.993e-05', 'epoch': '0.2193', 'num_input_tokens_seen': 17829370, 'train_runtime': '9019', 'train_tokens_per_second': '1977'} +{'loss': '0.9246', 'grad_norm': '1.716', 'learning_rate': '4.993e-05', 'epoch': '0.2193', 'num_input_tokens_seen': 17831417, 'train_runtime': '9020', 'train_tokens_per_second': '1977'} +{'loss': '0.3776', 'grad_norm': '0.777', 'learning_rate': '4.993e-05', 'epoch': '0.2194', 'num_input_tokens_seen': 17833464, 'train_runtime': '9021', 'train_tokens_per_second': '1977'} +{'loss': '0.877', 'grad_norm': '1.418', 'learning_rate': '4.993e-05', 'epoch': '0.2194', 'num_input_tokens_seen': 17835511, 'train_runtime': '9022', 'train_tokens_per_second': '1977'} +{'loss': '1.981', 'grad_norm': '2.428', 'learning_rate': '4.993e-05', 'epoch': '0.2194', 'num_input_tokens_seen': 17837558, 'train_runtime': '9023', 'train_tokens_per_second': '1977'} +{'loss': '0.9222', 'grad_norm': '1.618', 'learning_rate': '4.993e-05', 'epoch': '0.2194', 'num_input_tokens_seen': 17839605, 'train_runtime': '9024', 'train_tokens_per_second': '1977'} +{'loss': '0.7166', 'grad_norm': '1.244', 'learning_rate': '4.993e-05', 'epoch': '0.2195', 'num_input_tokens_seen': 17841652, 'train_runtime': '9025', 'train_tokens_per_second': '1977'} +{'loss': '0.299', 'grad_norm': '0.8703', 'learning_rate': '4.993e-05', 'epoch': '0.2195', 'num_input_tokens_seen': 17843699, 'train_runtime': '9026', 'train_tokens_per_second': '1977'} +{'loss': '1.954', 'grad_norm': '2.274', 'learning_rate': '4.993e-05', 'epoch': '0.2195', 'num_input_tokens_seen': 17845746, 'train_runtime': '9027', 'train_tokens_per_second': '1977'} +{'loss': '0.9662', 'grad_norm': '1.739', 'learning_rate': '4.993e-05', 'epoch': '0.2195', 'num_input_tokens_seen': 17847793, 'train_runtime': '9028', 'train_tokens_per_second': '1977'} +{'loss': '0.5421', 'grad_norm': '1.11', 'learning_rate': '4.993e-05', 'epoch': '0.2196', 'num_input_tokens_seen': 17849840, 'train_runtime': '9029', 'train_tokens_per_second': '1977'} +{'loss': '0.4517', 'grad_norm': '0.9556', 'learning_rate': '4.993e-05', 'epoch': '0.2196', 'num_input_tokens_seen': 17851887, 'train_runtime': '9031', 'train_tokens_per_second': '1977'} +{'loss': '0.5153', 'grad_norm': '1.382', 'learning_rate': '4.993e-05', 'epoch': '0.2196', 'num_input_tokens_seen': 17853934, 'train_runtime': '9032', 'train_tokens_per_second': '1977'} +{'loss': '0.2739', 'grad_norm': '0.9725', 'learning_rate': '4.993e-05', 'epoch': '0.2196', 'num_input_tokens_seen': 17855981, 'train_runtime': '9033', 'train_tokens_per_second': '1977'} +{'loss': '0.2926', 'grad_norm': '0.9782', 'learning_rate': '4.993e-05', 'epoch': '0.2197', 'num_input_tokens_seen': 17858028, 'train_runtime': '9034', 'train_tokens_per_second': '1977'} +{'loss': '0.4619', 'grad_norm': '0.9854', 'learning_rate': '4.993e-05', 'epoch': '0.2197', 'num_input_tokens_seen': 17860075, 'train_runtime': '9035', 'train_tokens_per_second': '1977'} +{'loss': '0.666', 'grad_norm': '1.043', 'learning_rate': '4.993e-05', 'epoch': '0.2197', 'num_input_tokens_seen': 17862122, 'train_runtime': '9036', 'train_tokens_per_second': '1977'} +{'loss': '0.3611', 'grad_norm': '0.7958', 'learning_rate': '4.993e-05', 'epoch': '0.2197', 'num_input_tokens_seen': 17864169, 'train_runtime': '9037', 'train_tokens_per_second': '1977'} +{'loss': '0.8013', 'grad_norm': '1.98', 'learning_rate': '4.993e-05', 'epoch': '0.2198', 'num_input_tokens_seen': 17866216, 'train_runtime': '9038', 'train_tokens_per_second': '1977'} +{'loss': '1.042', 'grad_norm': '2.025', 'learning_rate': '4.993e-05', 'epoch': '0.2198', 'num_input_tokens_seen': 17868263, 'train_runtime': '9039', 'train_tokens_per_second': '1977'} +{'loss': '0.7807', 'grad_norm': '1.16', 'learning_rate': '4.993e-05', 'epoch': '0.2198', 'num_input_tokens_seen': 17870310, 'train_runtime': '9040', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '2.266', 'learning_rate': '4.993e-05', 'epoch': '0.2198', 'num_input_tokens_seen': 17872357, 'train_runtime': '9041', 'train_tokens_per_second': '1977'} +{'loss': '1.147', 'grad_norm': '1.773', 'learning_rate': '4.993e-05', 'epoch': '0.2199', 'num_input_tokens_seen': 17874404, 'train_runtime': '9042', 'train_tokens_per_second': '1977'} +{'loss': '0.2551', 'grad_norm': '0.7403', 'learning_rate': '4.993e-05', 'epoch': '0.2199', 'num_input_tokens_seen': 17876451, 'train_runtime': '9043', 'train_tokens_per_second': '1977'} +{'loss': '0.4216', 'grad_norm': '1.185', 'learning_rate': '4.993e-05', 'epoch': '0.2199', 'num_input_tokens_seen': 17878498, 'train_runtime': '9044', 'train_tokens_per_second': '1977'} +{'loss': '0.8445', 'grad_norm': '1.597', 'learning_rate': '4.993e-05', 'epoch': '0.2199', 'num_input_tokens_seen': 17880545, 'train_runtime': '9045', 'train_tokens_per_second': '1977'} +{'loss': '0.382', 'grad_norm': '0.7851', 'learning_rate': '4.993e-05', 'epoch': '0.22', 'num_input_tokens_seen': 17882592, 'train_runtime': '9046', 'train_tokens_per_second': '1977'} +{'loss': '0.3145', 'grad_norm': '0.8872', 'learning_rate': '4.993e-05', 'epoch': '0.22', 'num_input_tokens_seen': 17884639, 'train_runtime': '9047', 'train_tokens_per_second': '1977'} +{'loss': '0.8874', 'grad_norm': '1.384', 'learning_rate': '4.993e-05', 'epoch': '0.22', 'num_input_tokens_seen': 17886686, 'train_runtime': '9048', 'train_tokens_per_second': '1977'} +{'loss': '0.5186', 'grad_norm': '1.034', 'learning_rate': '4.993e-05', 'epoch': '0.22', 'num_input_tokens_seen': 17888733, 'train_runtime': '9049', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '1.879', 'learning_rate': '4.993e-05', 'epoch': '0.2201', 'num_input_tokens_seen': 17890780, 'train_runtime': '9050', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '1.336', 'learning_rate': '4.993e-05', 'epoch': '0.2201', 'num_input_tokens_seen': 17892827, 'train_runtime': '9051', 'train_tokens_per_second': '1977'} +{'loss': '0.3334', 'grad_norm': '1.021', 'learning_rate': '4.993e-05', 'epoch': '0.2201', 'num_input_tokens_seen': 17894874, 'train_runtime': '9052', 'train_tokens_per_second': '1977'} +{'loss': '0.5396', 'grad_norm': '1.267', 'learning_rate': '4.993e-05', 'epoch': '0.2201', 'num_input_tokens_seen': 17896921, 'train_runtime': '9053', 'train_tokens_per_second': '1977'} +{'loss': '1.554', 'grad_norm': '2.4', 'learning_rate': '4.993e-05', 'epoch': '0.2202', 'num_input_tokens_seen': 17898968, 'train_runtime': '9054', 'train_tokens_per_second': '1977'} +{'loss': '0.7265', 'grad_norm': '1.315', 'learning_rate': '4.993e-05', 'epoch': '0.2202', 'num_input_tokens_seen': 17901015, 'train_runtime': '9055', 'train_tokens_per_second': '1977'} +{'loss': '0.5264', 'grad_norm': '1.415', 'learning_rate': '4.993e-05', 'epoch': '0.2202', 'num_input_tokens_seen': 17903062, 'train_runtime': '9056', 'train_tokens_per_second': '1977'} +{'loss': '0.935', 'grad_norm': '1.37', 'learning_rate': '4.993e-05', 'epoch': '0.2202', 'num_input_tokens_seen': 17905109, 'train_runtime': '9057', 'train_tokens_per_second': '1977'} +{'loss': '0.4072', 'grad_norm': '1.087', 'learning_rate': '4.993e-05', 'epoch': '0.2203', 'num_input_tokens_seen': 17907156, 'train_runtime': '9058', 'train_tokens_per_second': '1977'} +{'loss': '0.9571', 'grad_norm': '1.697', 'learning_rate': '4.993e-05', 'epoch': '0.2203', 'num_input_tokens_seen': 17909203, 'train_runtime': '9060', 'train_tokens_per_second': '1977'} +{'loss': '0.5157', 'grad_norm': '1.159', 'learning_rate': '4.993e-05', 'epoch': '0.2203', 'num_input_tokens_seen': 17911250, 'train_runtime': '9061', 'train_tokens_per_second': '1977'} +{'loss': '0.5666', 'grad_norm': '1.4', 'learning_rate': '4.993e-05', 'epoch': '0.2203', 'num_input_tokens_seen': 17913297, 'train_runtime': '9062', 'train_tokens_per_second': '1977'} +{'loss': '0.8146', 'grad_norm': '1.239', 'learning_rate': '4.993e-05', 'epoch': '0.2204', 'num_input_tokens_seen': 17915344, 'train_runtime': '9063', 'train_tokens_per_second': '1977'} +{'loss': '0.9966', 'grad_norm': '1.52', 'learning_rate': '4.993e-05', 'epoch': '0.2204', 'num_input_tokens_seen': 17917391, 'train_runtime': '9064', 'train_tokens_per_second': '1977'} +{'loss': '0.3757', 'grad_norm': '0.9304', 'learning_rate': '4.993e-05', 'epoch': '0.2204', 'num_input_tokens_seen': 17919438, 'train_runtime': '9065', 'train_tokens_per_second': '1977'} +{'loss': '0.354', 'grad_norm': '0.8699', 'learning_rate': '4.993e-05', 'epoch': '0.2204', 'num_input_tokens_seen': 17921485, 'train_runtime': '9066', 'train_tokens_per_second': '1977'} +{'loss': '1.425', 'grad_norm': '2.097', 'learning_rate': '4.993e-05', 'epoch': '0.2205', 'num_input_tokens_seen': 17923532, 'train_runtime': '9067', 'train_tokens_per_second': '1977'} +{'loss': '1.284', 'grad_norm': '1.773', 'learning_rate': '4.993e-05', 'epoch': '0.2205', 'num_input_tokens_seen': 17925579, 'train_runtime': '9068', 'train_tokens_per_second': '1977'} +{'loss': '0.472', 'grad_norm': '1.1', 'learning_rate': '4.993e-05', 'epoch': '0.2205', 'num_input_tokens_seen': 17927626, 'train_runtime': '9069', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '1.746', 'learning_rate': '4.993e-05', 'epoch': '0.2205', 'num_input_tokens_seen': 17929673, 'train_runtime': '9070', 'train_tokens_per_second': '1977'} +{'loss': '1.769', 'grad_norm': '2.268', 'learning_rate': '4.993e-05', 'epoch': '0.2206', 'num_input_tokens_seen': 17931720, 'train_runtime': '9071', 'train_tokens_per_second': '1977'} +{'loss': '2.084', 'grad_norm': '2.71', 'learning_rate': '4.993e-05', 'epoch': '0.2206', 'num_input_tokens_seen': 17933767, 'train_runtime': '9072', 'train_tokens_per_second': '1977'} +{'loss': '1.08', 'grad_norm': '1.308', 'learning_rate': '4.993e-05', 'epoch': '0.2206', 'num_input_tokens_seen': 17935814, 'train_runtime': '9073', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '1.824', 'learning_rate': '4.993e-05', 'epoch': '0.2206', 'num_input_tokens_seen': 17937861, 'train_runtime': '9074', 'train_tokens_per_second': '1977'} +{'loss': '0.2769', 'grad_norm': '0.8885', 'learning_rate': '4.993e-05', 'epoch': '0.2207', 'num_input_tokens_seen': 17939908, 'train_runtime': '9075', 'train_tokens_per_second': '1977'} +{'loss': '0.5012', 'grad_norm': '1.071', 'learning_rate': '4.993e-05', 'epoch': '0.2207', 'num_input_tokens_seen': 17941955, 'train_runtime': '9076', 'train_tokens_per_second': '1977'} +{'loss': '1.26', 'grad_norm': '2.294', 'learning_rate': '4.993e-05', 'epoch': '0.2207', 'num_input_tokens_seen': 17944002, 'train_runtime': '9077', 'train_tokens_per_second': '1977'} +{'loss': '0.5605', 'grad_norm': '1.295', 'learning_rate': '4.993e-05', 'epoch': '0.2207', 'num_input_tokens_seen': 17946049, 'train_runtime': '9078', 'train_tokens_per_second': '1977'} +{'loss': '0.6823', 'grad_norm': '1.433', 'learning_rate': '4.993e-05', 'epoch': '0.2208', 'num_input_tokens_seen': 17948096, 'train_runtime': '9079', 'train_tokens_per_second': '1977'} +{'loss': '0.4416', 'grad_norm': '0.9546', 'learning_rate': '4.993e-05', 'epoch': '0.2208', 'num_input_tokens_seen': 17950143, 'train_runtime': '9080', 'train_tokens_per_second': '1977'} +{'loss': '0.971', 'grad_norm': '1.728', 'learning_rate': '4.993e-05', 'epoch': '0.2208', 'num_input_tokens_seen': 17952190, 'train_runtime': '9081', 'train_tokens_per_second': '1977'} +{'loss': '0.3452', 'grad_norm': '0.9308', 'learning_rate': '4.993e-05', 'epoch': '0.2208', 'num_input_tokens_seen': 17954237, 'train_runtime': '9082', 'train_tokens_per_second': '1977'} +{'loss': '0.5138', 'grad_norm': '1.233', 'learning_rate': '4.993e-05', 'epoch': '0.2209', 'num_input_tokens_seen': 17956284, 'train_runtime': '9083', 'train_tokens_per_second': '1977'} +{'loss': '1.335', 'grad_norm': '2.125', 'learning_rate': '4.992e-05', 'epoch': '0.2209', 'num_input_tokens_seen': 17958331, 'train_runtime': '9084', 'train_tokens_per_second': '1977'} +{'loss': '1.123', 'grad_norm': '1.183', 'learning_rate': '4.992e-05', 'epoch': '0.2209', 'num_input_tokens_seen': 17960378, 'train_runtime': '9085', 'train_tokens_per_second': '1977'} +{'loss': '0.3732', 'grad_norm': '1.071', 'learning_rate': '4.992e-05', 'epoch': '0.2209', 'num_input_tokens_seen': 17962425, 'train_runtime': '9086', 'train_tokens_per_second': '1977'} +{'loss': '0.6377', 'grad_norm': '1.335', 'learning_rate': '4.992e-05', 'epoch': '0.221', 'num_input_tokens_seen': 17964472, 'train_runtime': '9088', 'train_tokens_per_second': '1977'} +{'loss': '1.297', 'grad_norm': '1.872', 'learning_rate': '4.992e-05', 'epoch': '0.221', 'num_input_tokens_seen': 17966519, 'train_runtime': '9089', 'train_tokens_per_second': '1977'} +{'loss': '0.8125', 'grad_norm': '1.728', 'learning_rate': '4.992e-05', 'epoch': '0.221', 'num_input_tokens_seen': 17968566, 'train_runtime': '9090', 'train_tokens_per_second': '1977'} +{'loss': '0.6257', 'grad_norm': '1.233', 'learning_rate': '4.992e-05', 'epoch': '0.221', 'num_input_tokens_seen': 17970613, 'train_runtime': '9091', 'train_tokens_per_second': '1977'} +{'loss': '0.865', 'grad_norm': '2.053', 'learning_rate': '4.992e-05', 'epoch': '0.2211', 'num_input_tokens_seen': 17972660, 'train_runtime': '9092', 'train_tokens_per_second': '1977'} +{'loss': '0.9393', 'grad_norm': '1.354', 'learning_rate': '4.992e-05', 'epoch': '0.2211', 'num_input_tokens_seen': 17974707, 'train_runtime': '9093', 'train_tokens_per_second': '1977'} +{'loss': '1.484', 'grad_norm': '2.578', 'learning_rate': '4.992e-05', 'epoch': '0.2211', 'num_input_tokens_seen': 17976754, 'train_runtime': '9094', 'train_tokens_per_second': '1977'} +{'loss': '0.4387', 'grad_norm': '1.17', 'learning_rate': '4.992e-05', 'epoch': '0.2211', 'num_input_tokens_seen': 17978801, 'train_runtime': '9095', 'train_tokens_per_second': '1977'} +{'loss': '0.5034', 'grad_norm': '1.365', 'learning_rate': '4.992e-05', 'epoch': '0.2212', 'num_input_tokens_seen': 17980848, 'train_runtime': '9096', 'train_tokens_per_second': '1977'} +{'loss': '0.7132', 'grad_norm': '1.037', 'learning_rate': '4.992e-05', 'epoch': '0.2212', 'num_input_tokens_seen': 17982895, 'train_runtime': '9097', 'train_tokens_per_second': '1977'} +{'loss': '0.9478', 'grad_norm': '1.699', 'learning_rate': '4.992e-05', 'epoch': '0.2212', 'num_input_tokens_seen': 17984942, 'train_runtime': '9098', 'train_tokens_per_second': '1977'} +{'loss': '1.012', 'grad_norm': '1.696', 'learning_rate': '4.992e-05', 'epoch': '0.2212', 'num_input_tokens_seen': 17986989, 'train_runtime': '9099', 'train_tokens_per_second': '1977'} +{'loss': '0.5103', 'grad_norm': '1.244', 'learning_rate': '4.992e-05', 'epoch': '0.2213', 'num_input_tokens_seen': 17989036, 'train_runtime': '9100', 'train_tokens_per_second': '1977'} +{'loss': '0.9935', 'grad_norm': '1.548', 'learning_rate': '4.992e-05', 'epoch': '0.2213', 'num_input_tokens_seen': 17991083, 'train_runtime': '9101', 'train_tokens_per_second': '1977'} +{'loss': '0.5635', 'grad_norm': '1.467', 'learning_rate': '4.992e-05', 'epoch': '0.2213', 'num_input_tokens_seen': 17993130, 'train_runtime': '9102', 'train_tokens_per_second': '1977'} +{'loss': '0.7891', 'grad_norm': '1.214', 'learning_rate': '4.992e-05', 'epoch': '0.2213', 'num_input_tokens_seen': 17995177, 'train_runtime': '9103', 'train_tokens_per_second': '1977'} +{'loss': '0.3384', 'grad_norm': '0.9386', 'learning_rate': '4.992e-05', 'epoch': '0.2214', 'num_input_tokens_seen': 17997224, 'train_runtime': '9104', 'train_tokens_per_second': '1977'} +{'loss': '0.7897', 'grad_norm': '1.312', 'learning_rate': '4.992e-05', 'epoch': '0.2214', 'num_input_tokens_seen': 17999271, 'train_runtime': '9105', 'train_tokens_per_second': '1977'} +{'loss': '0.446', 'grad_norm': '1.178', 'learning_rate': '4.992e-05', 'epoch': '0.2214', 'num_input_tokens_seen': 18001318, 'train_runtime': '9106', 'train_tokens_per_second': '1977'} +{'loss': '0.5025', 'grad_norm': '1.231', 'learning_rate': '4.992e-05', 'epoch': '0.2214', 'num_input_tokens_seen': 18003365, 'train_runtime': '9107', 'train_tokens_per_second': '1977'} +{'loss': '0.6075', 'grad_norm': '1.158', 'learning_rate': '4.992e-05', 'epoch': '0.2215', 'num_input_tokens_seen': 18005412, 'train_runtime': '9108', 'train_tokens_per_second': '1977'} +{'loss': '0.3987', 'grad_norm': '1.007', 'learning_rate': '4.992e-05', 'epoch': '0.2215', 'num_input_tokens_seen': 18007459, 'train_runtime': '9109', 'train_tokens_per_second': '1977'} +{'loss': '0.3843', 'grad_norm': '0.9334', 'learning_rate': '4.992e-05', 'epoch': '0.2215', 'num_input_tokens_seen': 18009506, 'train_runtime': '9110', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '1.459', 'learning_rate': '4.992e-05', 'epoch': '0.2215', 'num_input_tokens_seen': 18011553, 'train_runtime': '9111', 'train_tokens_per_second': '1977'} +{'loss': '0.8435', 'grad_norm': '1.748', 'learning_rate': '4.992e-05', 'epoch': '0.2216', 'num_input_tokens_seen': 18013600, 'train_runtime': '9112', 'train_tokens_per_second': '1977'} +{'loss': '0.6104', 'grad_norm': '1.369', 'learning_rate': '4.992e-05', 'epoch': '0.2216', 'num_input_tokens_seen': 18015647, 'train_runtime': '9113', 'train_tokens_per_second': '1977'} +{'loss': '0.7828', 'grad_norm': '1.388', 'learning_rate': '4.992e-05', 'epoch': '0.2216', 'num_input_tokens_seen': 18017694, 'train_runtime': '9115', 'train_tokens_per_second': '1977'} +{'loss': '0.3096', 'grad_norm': '0.8385', 'learning_rate': '4.992e-05', 'epoch': '0.2216', 'num_input_tokens_seen': 18019741, 'train_runtime': '9116', 'train_tokens_per_second': '1977'} +{'loss': '1', 'grad_norm': '1.507', 'learning_rate': '4.992e-05', 'epoch': '0.2217', 'num_input_tokens_seen': 18021788, 'train_runtime': '9117', 'train_tokens_per_second': '1977'} +{'loss': '0.7968', 'grad_norm': '1.313', 'learning_rate': '4.992e-05', 'epoch': '0.2217', 'num_input_tokens_seen': 18023835, 'train_runtime': '9118', 'train_tokens_per_second': '1977'} +{'loss': '0.3865', 'grad_norm': '0.8623', 'learning_rate': '4.992e-05', 'epoch': '0.2217', 'num_input_tokens_seen': 18025882, 'train_runtime': '9119', 'train_tokens_per_second': '1977'} +{'loss': '0.7958', 'grad_norm': '1.639', 'learning_rate': '4.992e-05', 'epoch': '0.2217', 'num_input_tokens_seen': 18027929, 'train_runtime': '9120', 'train_tokens_per_second': '1977'} +{'loss': '0.8777', 'grad_norm': '1.886', 'learning_rate': '4.992e-05', 'epoch': '0.2218', 'num_input_tokens_seen': 18029976, 'train_runtime': '9121', 'train_tokens_per_second': '1977'} +{'loss': '1.682', 'grad_norm': '1.984', 'learning_rate': '4.992e-05', 'epoch': '0.2218', 'num_input_tokens_seen': 18032023, 'train_runtime': '9122', 'train_tokens_per_second': '1977'} +{'loss': '1.713', 'grad_norm': '1.972', 'learning_rate': '4.992e-05', 'epoch': '0.2218', 'num_input_tokens_seen': 18034070, 'train_runtime': '9123', 'train_tokens_per_second': '1977'} +{'loss': '0.974', 'grad_norm': '1.473', 'learning_rate': '4.992e-05', 'epoch': '0.2218', 'num_input_tokens_seen': 18036117, 'train_runtime': '9124', 'train_tokens_per_second': '1977'} +{'loss': '1.33', 'grad_norm': '2.119', 'learning_rate': '4.992e-05', 'epoch': '0.2219', 'num_input_tokens_seen': 18038164, 'train_runtime': '9125', 'train_tokens_per_second': '1977'} +{'loss': '0.243', 'grad_norm': '0.8224', 'learning_rate': '4.992e-05', 'epoch': '0.2219', 'num_input_tokens_seen': 18040211, 'train_runtime': '9126', 'train_tokens_per_second': '1977'} +{'loss': '0.8694', 'grad_norm': '1.31', 'learning_rate': '4.992e-05', 'epoch': '0.2219', 'num_input_tokens_seen': 18042258, 'train_runtime': '9127', 'train_tokens_per_second': '1977'} +{'loss': '0.7271', 'grad_norm': '1.244', 'learning_rate': '4.992e-05', 'epoch': '0.2219', 'num_input_tokens_seen': 18044305, 'train_runtime': '9128', 'train_tokens_per_second': '1977'} +{'loss': '0.2677', 'grad_norm': '0.8831', 'learning_rate': '4.992e-05', 'epoch': '0.222', 'num_input_tokens_seen': 18046352, 'train_runtime': '9129', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '2.426', 'learning_rate': '4.992e-05', 'epoch': '0.222', 'num_input_tokens_seen': 18048399, 'train_runtime': '9130', 'train_tokens_per_second': '1977'} +{'loss': '0.469', 'grad_norm': '1.269', 'learning_rate': '4.992e-05', 'epoch': '0.222', 'num_input_tokens_seen': 18050446, 'train_runtime': '9131', 'train_tokens_per_second': '1977'} +{'loss': '0.2323', 'grad_norm': '0.8137', 'learning_rate': '4.992e-05', 'epoch': '0.222', 'num_input_tokens_seen': 18052493, 'train_runtime': '9132', 'train_tokens_per_second': '1977'} +{'loss': '0.8235', 'grad_norm': '1.433', 'learning_rate': '4.992e-05', 'epoch': '0.2221', 'num_input_tokens_seen': 18054540, 'train_runtime': '9133', 'train_tokens_per_second': '1977'} +{'loss': '0.635', 'grad_norm': '1.297', 'learning_rate': '4.992e-05', 'epoch': '0.2221', 'num_input_tokens_seen': 18056587, 'train_runtime': '9134', 'train_tokens_per_second': '1977'} +{'loss': '0.6968', 'grad_norm': '1.4', 'learning_rate': '4.992e-05', 'epoch': '0.2221', 'num_input_tokens_seen': 18058634, 'train_runtime': '9135', 'train_tokens_per_second': '1977'} +{'loss': '1.081', 'grad_norm': '1.472', 'learning_rate': '4.992e-05', 'epoch': '0.2221', 'num_input_tokens_seen': 18060681, 'train_runtime': '9136', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '1.778', 'learning_rate': '4.992e-05', 'epoch': '0.2222', 'num_input_tokens_seen': 18062728, 'train_runtime': '9137', 'train_tokens_per_second': '1977'} +{'loss': '0.2938', 'grad_norm': '0.7924', 'learning_rate': '4.992e-05', 'epoch': '0.2222', 'num_input_tokens_seen': 18064775, 'train_runtime': '9138', 'train_tokens_per_second': '1977'} +{'loss': '1.185', 'grad_norm': '2.137', 'learning_rate': '4.992e-05', 'epoch': '0.2222', 'num_input_tokens_seen': 18066822, 'train_runtime': '9139', 'train_tokens_per_second': '1977'} +{'loss': '0.872', 'grad_norm': '1.538', 'learning_rate': '4.992e-05', 'epoch': '0.2222', 'num_input_tokens_seen': 18068869, 'train_runtime': '9140', 'train_tokens_per_second': '1977'} +{'loss': '0.9683', 'grad_norm': '1.487', 'learning_rate': '4.992e-05', 'epoch': '0.2223', 'num_input_tokens_seen': 18070916, 'train_runtime': '9141', 'train_tokens_per_second': '1977'} +{'loss': '0.2764', 'grad_norm': '0.8923', 'learning_rate': '4.992e-05', 'epoch': '0.2223', 'num_input_tokens_seen': 18072963, 'train_runtime': '9143', 'train_tokens_per_second': '1977'} +{'loss': '0.44', 'grad_norm': '0.9452', 'learning_rate': '4.992e-05', 'epoch': '0.2223', 'num_input_tokens_seen': 18075010, 'train_runtime': '9144', 'train_tokens_per_second': '1977'} +{'loss': '0.9637', 'grad_norm': '1.951', 'learning_rate': '4.992e-05', 'epoch': '0.2223', 'num_input_tokens_seen': 18077057, 'train_runtime': '9145', 'train_tokens_per_second': '1977'} +{'loss': '1.151', 'grad_norm': '1.428', 'learning_rate': '4.992e-05', 'epoch': '0.2224', 'num_input_tokens_seen': 18079104, 'train_runtime': '9146', 'train_tokens_per_second': '1977'} +{'loss': '1.238', 'grad_norm': '2.502', 'learning_rate': '4.992e-05', 'epoch': '0.2224', 'num_input_tokens_seen': 18081151, 'train_runtime': '9147', 'train_tokens_per_second': '1977'} +{'loss': '0.9125', 'grad_norm': '1.547', 'learning_rate': '4.992e-05', 'epoch': '0.2224', 'num_input_tokens_seen': 18083198, 'train_runtime': '9148', 'train_tokens_per_second': '1977'} +{'loss': '0.7401', 'grad_norm': '1.462', 'learning_rate': '4.992e-05', 'epoch': '0.2224', 'num_input_tokens_seen': 18085245, 'train_runtime': '9149', 'train_tokens_per_second': '1977'} +{'loss': '0.6234', 'grad_norm': '1.122', 'learning_rate': '4.992e-05', 'epoch': '0.2225', 'num_input_tokens_seen': 18087292, 'train_runtime': '9150', 'train_tokens_per_second': '1977'} +{'loss': '0.7923', 'grad_norm': '1.598', 'learning_rate': '4.992e-05', 'epoch': '0.2225', 'num_input_tokens_seen': 18089339, 'train_runtime': '9151', 'train_tokens_per_second': '1977'} +{'loss': '0.5836', 'grad_norm': '1.402', 'learning_rate': '4.992e-05', 'epoch': '0.2225', 'num_input_tokens_seen': 18091386, 'train_runtime': '9152', 'train_tokens_per_second': '1977'} +{'loss': '0.4785', 'grad_norm': '1.274', 'learning_rate': '4.992e-05', 'epoch': '0.2225', 'num_input_tokens_seen': 18093433, 'train_runtime': '9153', 'train_tokens_per_second': '1977'} +{'loss': '0.8724', 'grad_norm': '1.441', 'learning_rate': '4.992e-05', 'epoch': '0.2226', 'num_input_tokens_seen': 18095480, 'train_runtime': '9154', 'train_tokens_per_second': '1977'} +{'loss': '0.9102', 'grad_norm': '1.299', 'learning_rate': '4.992e-05', 'epoch': '0.2226', 'num_input_tokens_seen': 18097527, 'train_runtime': '9155', 'train_tokens_per_second': '1977'} +{'loss': '1.601', 'grad_norm': '2.33', 'learning_rate': '4.992e-05', 'epoch': '0.2226', 'num_input_tokens_seen': 18099574, 'train_runtime': '9156', 'train_tokens_per_second': '1977'} +{'loss': '0.3702', 'grad_norm': '1.037', 'learning_rate': '4.992e-05', 'epoch': '0.2227', 'num_input_tokens_seen': 18101621, 'train_runtime': '9157', 'train_tokens_per_second': '1977'} +{'loss': '0.6234', 'grad_norm': '1.189', 'learning_rate': '4.992e-05', 'epoch': '0.2227', 'num_input_tokens_seen': 18103668, 'train_runtime': '9158', 'train_tokens_per_second': '1977'} +{'loss': '0.3564', 'grad_norm': '1.004', 'learning_rate': '4.992e-05', 'epoch': '0.2227', 'num_input_tokens_seen': 18105715, 'train_runtime': '9159', 'train_tokens_per_second': '1977'} +{'loss': '0.3982', 'grad_norm': '1.067', 'learning_rate': '4.992e-05', 'epoch': '0.2227', 'num_input_tokens_seen': 18107762, 'train_runtime': '9160', 'train_tokens_per_second': '1977'} +{'loss': '0.7774', 'grad_norm': '1.376', 'learning_rate': '4.992e-05', 'epoch': '0.2228', 'num_input_tokens_seen': 18109809, 'train_runtime': '9161', 'train_tokens_per_second': '1977'} +{'loss': '0.6778', 'grad_norm': '1.704', 'learning_rate': '4.992e-05', 'epoch': '0.2228', 'num_input_tokens_seen': 18111856, 'train_runtime': '9162', 'train_tokens_per_second': '1977'} +{'loss': '0.4579', 'grad_norm': '1.151', 'learning_rate': '4.992e-05', 'epoch': '0.2228', 'num_input_tokens_seen': 18113903, 'train_runtime': '9163', 'train_tokens_per_second': '1977'} +{'loss': '1.315', 'grad_norm': '1.359', 'learning_rate': '4.992e-05', 'epoch': '0.2228', 'num_input_tokens_seen': 18115950, 'train_runtime': '9164', 'train_tokens_per_second': '1977'} +{'loss': '0.6916', 'grad_norm': '1.233', 'learning_rate': '4.992e-05', 'epoch': '0.2229', 'num_input_tokens_seen': 18117997, 'train_runtime': '9165', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.142', 'learning_rate': '4.992e-05', 'epoch': '0.2229', 'num_input_tokens_seen': 18120044, 'train_runtime': '9166', 'train_tokens_per_second': '1977'} +{'loss': '0.7891', 'grad_norm': '1.273', 'learning_rate': '4.992e-05', 'epoch': '0.2229', 'num_input_tokens_seen': 18122091, 'train_runtime': '9167', 'train_tokens_per_second': '1977'} +{'loss': '0.7745', 'grad_norm': '1.585', 'learning_rate': '4.992e-05', 'epoch': '0.2229', 'num_input_tokens_seen': 18124138, 'train_runtime': '9168', 'train_tokens_per_second': '1977'} +{'loss': '1.333', 'grad_norm': '2.007', 'learning_rate': '4.992e-05', 'epoch': '0.223', 'num_input_tokens_seen': 18126185, 'train_runtime': '9169', 'train_tokens_per_second': '1977'} +{'loss': '0.6466', 'grad_norm': '1.437', 'learning_rate': '4.992e-05', 'epoch': '0.223', 'num_input_tokens_seen': 18128232, 'train_runtime': '9170', 'train_tokens_per_second': '1977'} +{'loss': '0.3961', 'grad_norm': '1.221', 'learning_rate': '4.992e-05', 'epoch': '0.223', 'num_input_tokens_seen': 18130279, 'train_runtime': '9172', 'train_tokens_per_second': '1977'} +{'loss': '0.3056', 'grad_norm': '1.067', 'learning_rate': '4.992e-05', 'epoch': '0.223', 'num_input_tokens_seen': 18132326, 'train_runtime': '9173', 'train_tokens_per_second': '1977'} +{'loss': '0.6072', 'grad_norm': '0.9592', 'learning_rate': '4.992e-05', 'epoch': '0.2231', 'num_input_tokens_seen': 18134373, 'train_runtime': '9174', 'train_tokens_per_second': '1977'} +{'loss': '0.3592', 'grad_norm': '0.9874', 'learning_rate': '4.992e-05', 'epoch': '0.2231', 'num_input_tokens_seen': 18136420, 'train_runtime': '9175', 'train_tokens_per_second': '1977'} +{'loss': '0.3895', 'grad_norm': '1.09', 'learning_rate': '4.992e-05', 'epoch': '0.2231', 'num_input_tokens_seen': 18138467, 'train_runtime': '9176', 'train_tokens_per_second': '1977'} +{'loss': '0.4668', 'grad_norm': '1.428', 'learning_rate': '4.992e-05', 'epoch': '0.2231', 'num_input_tokens_seen': 18140514, 'train_runtime': '9177', 'train_tokens_per_second': '1977'} +{'loss': '0.8494', 'grad_norm': '1.522', 'learning_rate': '4.992e-05', 'epoch': '0.2232', 'num_input_tokens_seen': 18142561, 'train_runtime': '9178', 'train_tokens_per_second': '1977'} +{'loss': '0.8018', 'grad_norm': '1.32', 'learning_rate': '4.992e-05', 'epoch': '0.2232', 'num_input_tokens_seen': 18144608, 'train_runtime': '9179', 'train_tokens_per_second': '1977'} +{'loss': '0.3566', 'grad_norm': '0.9338', 'learning_rate': '4.992e-05', 'epoch': '0.2232', 'num_input_tokens_seen': 18146655, 'train_runtime': '9180', 'train_tokens_per_second': '1977'} +{'loss': '0.9467', 'grad_norm': '1.421', 'learning_rate': '4.992e-05', 'epoch': '0.2232', 'num_input_tokens_seen': 18148702, 'train_runtime': '9181', 'train_tokens_per_second': '1977'} +{'loss': '0.2843', 'grad_norm': '0.8282', 'learning_rate': '4.992e-05', 'epoch': '0.2233', 'num_input_tokens_seen': 18150749, 'train_runtime': '9182', 'train_tokens_per_second': '1977'} +{'loss': '0.8453', 'grad_norm': '1.992', 'learning_rate': '4.992e-05', 'epoch': '0.2233', 'num_input_tokens_seen': 18152796, 'train_runtime': '9183', 'train_tokens_per_second': '1977'} +{'loss': '1.109', 'grad_norm': '2.14', 'learning_rate': '4.992e-05', 'epoch': '0.2233', 'num_input_tokens_seen': 18154843, 'train_runtime': '9184', 'train_tokens_per_second': '1977'} +{'loss': '0.3658', 'grad_norm': '1.033', 'learning_rate': '4.992e-05', 'epoch': '0.2233', 'num_input_tokens_seen': 18156890, 'train_runtime': '9185', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '1.926', 'learning_rate': '4.992e-05', 'epoch': '0.2234', 'num_input_tokens_seen': 18158937, 'train_runtime': '9186', 'train_tokens_per_second': '1977'} +{'loss': '0.3149', 'grad_norm': '1.128', 'learning_rate': '4.992e-05', 'epoch': '0.2234', 'num_input_tokens_seen': 18160984, 'train_runtime': '9187', 'train_tokens_per_second': '1977'} +{'loss': '0.645', 'grad_norm': '1.23', 'learning_rate': '4.992e-05', 'epoch': '0.2234', 'num_input_tokens_seen': 18163031, 'train_runtime': '9188', 'train_tokens_per_second': '1977'} +{'loss': '0.5993', 'grad_norm': '1.026', 'learning_rate': '4.992e-05', 'epoch': '0.2234', 'num_input_tokens_seen': 18165078, 'train_runtime': '9189', 'train_tokens_per_second': '1977'} +{'loss': '0.2357', 'grad_norm': '0.9977', 'learning_rate': '4.992e-05', 'epoch': '0.2235', 'num_input_tokens_seen': 18167125, 'train_runtime': '9190', 'train_tokens_per_second': '1977'} +{'loss': '1.197', 'grad_norm': '2.226', 'learning_rate': '4.992e-05', 'epoch': '0.2235', 'num_input_tokens_seen': 18169172, 'train_runtime': '9191', 'train_tokens_per_second': '1977'} +{'loss': '1.305', 'grad_norm': '2.312', 'learning_rate': '4.992e-05', 'epoch': '0.2235', 'num_input_tokens_seen': 18171219, 'train_runtime': '9192', 'train_tokens_per_second': '1977'} +{'loss': '0.342', 'grad_norm': '0.9238', 'learning_rate': '4.992e-05', 'epoch': '0.2235', 'num_input_tokens_seen': 18173266, 'train_runtime': '9193', 'train_tokens_per_second': '1977'} +{'loss': '0.4334', 'grad_norm': '1.014', 'learning_rate': '4.992e-05', 'epoch': '0.2236', 'num_input_tokens_seen': 18175313, 'train_runtime': '9194', 'train_tokens_per_second': '1977'} +{'loss': '0.363', 'grad_norm': '1.084', 'learning_rate': '4.992e-05', 'epoch': '0.2236', 'num_input_tokens_seen': 18177360, 'train_runtime': '9195', 'train_tokens_per_second': '1977'} +{'loss': '0.9959', 'grad_norm': '1.781', 'learning_rate': '4.992e-05', 'epoch': '0.2236', 'num_input_tokens_seen': 18179407, 'train_runtime': '9196', 'train_tokens_per_second': '1977'} +{'loss': '0.2309', 'grad_norm': '0.9463', 'learning_rate': '4.992e-05', 'epoch': '0.2236', 'num_input_tokens_seen': 18181454, 'train_runtime': '9197', 'train_tokens_per_second': '1977'} +{'loss': '0.5231', 'grad_norm': '1.406', 'learning_rate': '4.992e-05', 'epoch': '0.2237', 'num_input_tokens_seen': 18183501, 'train_runtime': '9198', 'train_tokens_per_second': '1977'} +{'loss': '1.48', 'grad_norm': '1.864', 'learning_rate': '4.992e-05', 'epoch': '0.2237', 'num_input_tokens_seen': 18185548, 'train_runtime': '9199', 'train_tokens_per_second': '1977'} +{'loss': '1.476', 'grad_norm': '2.461', 'learning_rate': '4.992e-05', 'epoch': '0.2237', 'num_input_tokens_seen': 18187595, 'train_runtime': '9200', 'train_tokens_per_second': '1977'} +{'loss': '0.9787', 'grad_norm': '1.456', 'learning_rate': '4.992e-05', 'epoch': '0.2237', 'num_input_tokens_seen': 18189642, 'train_runtime': '9201', 'train_tokens_per_second': '1977'} +{'loss': '0.571', 'grad_norm': '1.331', 'learning_rate': '4.992e-05', 'epoch': '0.2238', 'num_input_tokens_seen': 18191689, 'train_runtime': '9203', 'train_tokens_per_second': '1977'} +{'loss': '0.8326', 'grad_norm': '1.269', 'learning_rate': '4.992e-05', 'epoch': '0.2238', 'num_input_tokens_seen': 18193736, 'train_runtime': '9204', 'train_tokens_per_second': '1977'} +{'loss': '0.963', 'grad_norm': '2.023', 'learning_rate': '4.992e-05', 'epoch': '0.2238', 'num_input_tokens_seen': 18195783, 'train_runtime': '9205', 'train_tokens_per_second': '1977'} +{'loss': '0.7075', 'grad_norm': '1.292', 'learning_rate': '4.992e-05', 'epoch': '0.2238', 'num_input_tokens_seen': 18197830, 'train_runtime': '9206', 'train_tokens_per_second': '1977'} +{'loss': '0.7058', 'grad_norm': '1.073', 'learning_rate': '4.992e-05', 'epoch': '0.2239', 'num_input_tokens_seen': 18199877, 'train_runtime': '9207', 'train_tokens_per_second': '1977'} +{'loss': '0.7114', 'grad_norm': '1.42', 'learning_rate': '4.992e-05', 'epoch': '0.2239', 'num_input_tokens_seen': 18201924, 'train_runtime': '9208', 'train_tokens_per_second': '1977'} +{'loss': '0.3996', 'grad_norm': '0.9098', 'learning_rate': '4.992e-05', 'epoch': '0.2239', 'num_input_tokens_seen': 18203971, 'train_runtime': '9209', 'train_tokens_per_second': '1977'} +{'loss': '0.5711', 'grad_norm': '1.156', 'learning_rate': '4.992e-05', 'epoch': '0.2239', 'num_input_tokens_seen': 18206018, 'train_runtime': '9210', 'train_tokens_per_second': '1977'} +{'loss': '0.8312', 'grad_norm': '1.72', 'learning_rate': '4.992e-05', 'epoch': '0.224', 'num_input_tokens_seen': 18208065, 'train_runtime': '9211', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.414', 'learning_rate': '4.992e-05', 'epoch': '0.224', 'num_input_tokens_seen': 18210112, 'train_runtime': '9212', 'train_tokens_per_second': '1977'} +{'loss': '0.969', 'grad_norm': '1.594', 'learning_rate': '4.992e-05', 'epoch': '0.224', 'num_input_tokens_seen': 18212159, 'train_runtime': '9213', 'train_tokens_per_second': '1977'} +{'loss': '0.3979', 'grad_norm': '1.04', 'learning_rate': '4.992e-05', 'epoch': '0.224', 'num_input_tokens_seen': 18214206, 'train_runtime': '9214', 'train_tokens_per_second': '1977'} +{'loss': '0.6003', 'grad_norm': '1.42', 'learning_rate': '4.992e-05', 'epoch': '0.2241', 'num_input_tokens_seen': 18216253, 'train_runtime': '9215', 'train_tokens_per_second': '1977'} +{'loss': '0.2385', 'grad_norm': '1.313', 'learning_rate': '4.992e-05', 'epoch': '0.2241', 'num_input_tokens_seen': 18218300, 'train_runtime': '9216', 'train_tokens_per_second': '1977'} +{'loss': '0.3211', 'grad_norm': '0.8638', 'learning_rate': '4.992e-05', 'epoch': '0.2241', 'num_input_tokens_seen': 18220347, 'train_runtime': '9217', 'train_tokens_per_second': '1977'} +{'loss': '0.9037', 'grad_norm': '1.331', 'learning_rate': '4.992e-05', 'epoch': '0.2241', 'num_input_tokens_seen': 18222394, 'train_runtime': '9218', 'train_tokens_per_second': '1977'} +{'loss': '0.3209', 'grad_norm': '0.9365', 'learning_rate': '4.992e-05', 'epoch': '0.2242', 'num_input_tokens_seen': 18224441, 'train_runtime': '9219', 'train_tokens_per_second': '1977'} +{'loss': '0.8529', 'grad_norm': '1.412', 'learning_rate': '4.992e-05', 'epoch': '0.2242', 'num_input_tokens_seen': 18226488, 'train_runtime': '9220', 'train_tokens_per_second': '1977'} +{'loss': '0.3739', 'grad_norm': '0.8173', 'learning_rate': '4.992e-05', 'epoch': '0.2242', 'num_input_tokens_seen': 18228535, 'train_runtime': '9221', 'train_tokens_per_second': '1977'} +{'loss': '0.2896', 'grad_norm': '0.897', 'learning_rate': '4.992e-05', 'epoch': '0.2242', 'num_input_tokens_seen': 18230582, 'train_runtime': '9222', 'train_tokens_per_second': '1977'} +{'loss': '0.4134', 'grad_norm': '1.083', 'learning_rate': '4.992e-05', 'epoch': '0.2243', 'num_input_tokens_seen': 18232629, 'train_runtime': '9223', 'train_tokens_per_second': '1977'} +{'loss': '0.7338', 'grad_norm': '0.9711', 'learning_rate': '4.992e-05', 'epoch': '0.2243', 'num_input_tokens_seen': 18234676, 'train_runtime': '9224', 'train_tokens_per_second': '1977'} +{'loss': '0.2566', 'grad_norm': '0.9895', 'learning_rate': '4.992e-05', 'epoch': '0.2243', 'num_input_tokens_seen': 18236723, 'train_runtime': '9225', 'train_tokens_per_second': '1977'} +{'loss': '0.1867', 'grad_norm': '0.8627', 'learning_rate': '4.992e-05', 'epoch': '0.2243', 'num_input_tokens_seen': 18238770, 'train_runtime': '9226', 'train_tokens_per_second': '1977'} +{'loss': '0.3035', 'grad_norm': '0.9777', 'learning_rate': '4.992e-05', 'epoch': '0.2244', 'num_input_tokens_seen': 18240817, 'train_runtime': '9227', 'train_tokens_per_second': '1977'} +{'loss': '0.6869', 'grad_norm': '1.365', 'learning_rate': '4.992e-05', 'epoch': '0.2244', 'num_input_tokens_seen': 18242864, 'train_runtime': '9228', 'train_tokens_per_second': '1977'} +{'loss': '0.7175', 'grad_norm': '1.773', 'learning_rate': '4.992e-05', 'epoch': '0.2244', 'num_input_tokens_seen': 18244911, 'train_runtime': '9229', 'train_tokens_per_second': '1977'} +{'loss': '1.506', 'grad_norm': '2.785', 'learning_rate': '4.992e-05', 'epoch': '0.2244', 'num_input_tokens_seen': 18246958, 'train_runtime': '9230', 'train_tokens_per_second': '1977'} +{'loss': '0.6714', 'grad_norm': '1.498', 'learning_rate': '4.992e-05', 'epoch': '0.2245', 'num_input_tokens_seen': 18249005, 'train_runtime': '9231', 'train_tokens_per_second': '1977'} +{'loss': '0.5893', 'grad_norm': '1.591', 'learning_rate': '4.992e-05', 'epoch': '0.2245', 'num_input_tokens_seen': 18251052, 'train_runtime': '9232', 'train_tokens_per_second': '1977'} +{'loss': '0.3125', 'grad_norm': '0.7518', 'learning_rate': '4.992e-05', 'epoch': '0.2245', 'num_input_tokens_seen': 18253099, 'train_runtime': '9234', 'train_tokens_per_second': '1977'} +{'loss': '0.9206', 'grad_norm': '1.474', 'learning_rate': '4.992e-05', 'epoch': '0.2245', 'num_input_tokens_seen': 18255146, 'train_runtime': '9235', 'train_tokens_per_second': '1977'} +{'loss': '2.051', 'grad_norm': '2.812', 'learning_rate': '4.992e-05', 'epoch': '0.2246', 'num_input_tokens_seen': 18257193, 'train_runtime': '9236', 'train_tokens_per_second': '1977'} +{'loss': '0.3094', 'grad_norm': '0.8602', 'learning_rate': '4.992e-05', 'epoch': '0.2246', 'num_input_tokens_seen': 18259240, 'train_runtime': '9237', 'train_tokens_per_second': '1977'} +{'loss': '0.3392', 'grad_norm': '0.9171', 'learning_rate': '4.992e-05', 'epoch': '0.2246', 'num_input_tokens_seen': 18261287, 'train_runtime': '9238', 'train_tokens_per_second': '1977'} +{'loss': '1.765', 'grad_norm': '2.215', 'learning_rate': '4.992e-05', 'epoch': '0.2246', 'num_input_tokens_seen': 18263334, 'train_runtime': '9239', 'train_tokens_per_second': '1977'} +{'loss': '0.453', 'grad_norm': '0.9876', 'learning_rate': '4.992e-05', 'epoch': '0.2247', 'num_input_tokens_seen': 18265381, 'train_runtime': '9240', 'train_tokens_per_second': '1977'} +{'loss': '1.171', 'grad_norm': '1.687', 'learning_rate': '4.992e-05', 'epoch': '0.2247', 'num_input_tokens_seen': 18267428, 'train_runtime': '9241', 'train_tokens_per_second': '1977'} +{'loss': '0.5705', 'grad_norm': '1.145', 'learning_rate': '4.992e-05', 'epoch': '0.2247', 'num_input_tokens_seen': 18269475, 'train_runtime': '9242', 'train_tokens_per_second': '1977'} +{'loss': '0.5911', 'grad_norm': '1.187', 'learning_rate': '4.992e-05', 'epoch': '0.2247', 'num_input_tokens_seen': 18271522, 'train_runtime': '9243', 'train_tokens_per_second': '1977'} +{'loss': '1.787', 'grad_norm': '2.522', 'learning_rate': '4.992e-05', 'epoch': '0.2248', 'num_input_tokens_seen': 18273569, 'train_runtime': '9244', 'train_tokens_per_second': '1977'} +{'loss': '0.5315', 'grad_norm': '0.9999', 'learning_rate': '4.992e-05', 'epoch': '0.2248', 'num_input_tokens_seen': 18275616, 'train_runtime': '9245', 'train_tokens_per_second': '1977'} +{'loss': '1.042', 'grad_norm': '1.729', 'learning_rate': '4.992e-05', 'epoch': '0.2248', 'num_input_tokens_seen': 18277663, 'train_runtime': '9246', 'train_tokens_per_second': '1977'} +{'loss': '0.3798', 'grad_norm': '0.8646', 'learning_rate': '4.992e-05', 'epoch': '0.2248', 'num_input_tokens_seen': 18279710, 'train_runtime': '9247', 'train_tokens_per_second': '1977'} +{'loss': '0.3035', 'grad_norm': '0.8422', 'learning_rate': '4.992e-05', 'epoch': '0.2249', 'num_input_tokens_seen': 18281757, 'train_runtime': '9248', 'train_tokens_per_second': '1977'} +{'loss': '1.249', 'grad_norm': '1.798', 'learning_rate': '4.992e-05', 'epoch': '0.2249', 'num_input_tokens_seen': 18283804, 'train_runtime': '9249', 'train_tokens_per_second': '1977'} +{'loss': '0.9142', 'grad_norm': '1.386', 'learning_rate': '4.992e-05', 'epoch': '0.2249', 'num_input_tokens_seen': 18285851, 'train_runtime': '9250', 'train_tokens_per_second': '1977'} +{'loss': '0.6059', 'grad_norm': '1.334', 'learning_rate': '4.992e-05', 'epoch': '0.2249', 'num_input_tokens_seen': 18287898, 'train_runtime': '9251', 'train_tokens_per_second': '1977'} +{'loss': '0.5538', 'grad_norm': '1.072', 'learning_rate': '4.992e-05', 'epoch': '0.225', 'num_input_tokens_seen': 18289945, 'train_runtime': '9252', 'train_tokens_per_second': '1977'} +{'loss': '0.6363', 'grad_norm': '1.398', 'learning_rate': '4.992e-05', 'epoch': '0.225', 'num_input_tokens_seen': 18291992, 'train_runtime': '9253', 'train_tokens_per_second': '1977'} +{'loss': '0.3811', 'grad_norm': '0.904', 'learning_rate': '4.992e-05', 'epoch': '0.225', 'num_input_tokens_seen': 18294039, 'train_runtime': '9254', 'train_tokens_per_second': '1977'} +{'loss': '1.259', 'grad_norm': '1.988', 'learning_rate': '4.992e-05', 'epoch': '0.225', 'num_input_tokens_seen': 18296086, 'train_runtime': '9255', 'train_tokens_per_second': '1977'} +{'loss': '1.117', 'grad_norm': '1.102', 'learning_rate': '4.992e-05', 'epoch': '0.2251', 'num_input_tokens_seen': 18298133, 'train_runtime': '9256', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '1.865', 'learning_rate': '4.992e-05', 'epoch': '0.2251', 'num_input_tokens_seen': 18300180, 'train_runtime': '9257', 'train_tokens_per_second': '1977'} +{'loss': '0.3786', 'grad_norm': '0.9204', 'learning_rate': '4.992e-05', 'epoch': '0.2251', 'num_input_tokens_seen': 18302227, 'train_runtime': '9258', 'train_tokens_per_second': '1977'} +{'loss': '0.622', 'grad_norm': '1.284', 'learning_rate': '4.992e-05', 'epoch': '0.2251', 'num_input_tokens_seen': 18304274, 'train_runtime': '9259', 'train_tokens_per_second': '1977'} +{'loss': '0.2247', 'grad_norm': '0.8204', 'learning_rate': '4.992e-05', 'epoch': '0.2252', 'num_input_tokens_seen': 18306321, 'train_runtime': '9260', 'train_tokens_per_second': '1977'} +{'loss': '0.2875', 'grad_norm': '0.9312', 'learning_rate': '4.992e-05', 'epoch': '0.2252', 'num_input_tokens_seen': 18308368, 'train_runtime': '9261', 'train_tokens_per_second': '1977'} +{'loss': '0.3844', 'grad_norm': '1.07', 'learning_rate': '4.992e-05', 'epoch': '0.2252', 'num_input_tokens_seen': 18310415, 'train_runtime': '9262', 'train_tokens_per_second': '1977'} +{'loss': '0.3828', 'grad_norm': '0.8719', 'learning_rate': '4.992e-05', 'epoch': '0.2252', 'num_input_tokens_seen': 18312462, 'train_runtime': '9263', 'train_tokens_per_second': '1977'} +{'loss': '0.446', 'grad_norm': '1.063', 'learning_rate': '4.992e-05', 'epoch': '0.2253', 'num_input_tokens_seen': 18314509, 'train_runtime': '9264', 'train_tokens_per_second': '1977'} +{'loss': '1.41', 'grad_norm': '2.097', 'learning_rate': '4.992e-05', 'epoch': '0.2253', 'num_input_tokens_seen': 18316556, 'train_runtime': '9266', 'train_tokens_per_second': '1977'} +{'loss': '1.361', 'grad_norm': '2.685', 'learning_rate': '4.992e-05', 'epoch': '0.2253', 'num_input_tokens_seen': 18318603, 'train_runtime': '9267', 'train_tokens_per_second': '1977'} +{'loss': '1.931', 'grad_norm': '2.392', 'learning_rate': '4.992e-05', 'epoch': '0.2253', 'num_input_tokens_seen': 18320650, 'train_runtime': '9268', 'train_tokens_per_second': '1977'} +{'loss': '1.108', 'grad_norm': '2.088', 'learning_rate': '4.992e-05', 'epoch': '0.2254', 'num_input_tokens_seen': 18322697, 'train_runtime': '9269', 'train_tokens_per_second': '1977'} +{'loss': '0.9128', 'grad_norm': '1.615', 'learning_rate': '4.992e-05', 'epoch': '0.2254', 'num_input_tokens_seen': 18324744, 'train_runtime': '9270', 'train_tokens_per_second': '1977'} +{'loss': '0.3647', 'grad_norm': '1.252', 'learning_rate': '4.992e-05', 'epoch': '0.2254', 'num_input_tokens_seen': 18326791, 'train_runtime': '9271', 'train_tokens_per_second': '1977'} +{'loss': '0.5961', 'grad_norm': '0.886', 'learning_rate': '4.992e-05', 'epoch': '0.2254', 'num_input_tokens_seen': 18328838, 'train_runtime': '9272', 'train_tokens_per_second': '1977'} +{'loss': '0.9402', 'grad_norm': '1.512', 'learning_rate': '4.992e-05', 'epoch': '0.2255', 'num_input_tokens_seen': 18330885, 'train_runtime': '9273', 'train_tokens_per_second': '1977'} +{'loss': '1.214', 'grad_norm': '2.164', 'learning_rate': '4.992e-05', 'epoch': '0.2255', 'num_input_tokens_seen': 18332932, 'train_runtime': '9274', 'train_tokens_per_second': '1977'} +{'loss': '0.3201', 'grad_norm': '0.9627', 'learning_rate': '4.992e-05', 'epoch': '0.2255', 'num_input_tokens_seen': 18334979, 'train_runtime': '9275', 'train_tokens_per_second': '1977'} +{'loss': '0.7034', 'grad_norm': '1.694', 'learning_rate': '4.992e-05', 'epoch': '0.2255', 'num_input_tokens_seen': 18337026, 'train_runtime': '9276', 'train_tokens_per_second': '1977'} +{'loss': '0.66', 'grad_norm': '1.219', 'learning_rate': '4.992e-05', 'epoch': '0.2256', 'num_input_tokens_seen': 18339073, 'train_runtime': '9277', 'train_tokens_per_second': '1977'} +{'loss': '0.3368', 'grad_norm': '0.9341', 'learning_rate': '4.992e-05', 'epoch': '0.2256', 'num_input_tokens_seen': 18341120, 'train_runtime': '9278', 'train_tokens_per_second': '1977'} +{'loss': '0.4971', 'grad_norm': '1.072', 'learning_rate': '4.992e-05', 'epoch': '0.2256', 'num_input_tokens_seen': 18343167, 'train_runtime': '9279', 'train_tokens_per_second': '1977'} +{'loss': '0.8179', 'grad_norm': '1.131', 'learning_rate': '4.992e-05', 'epoch': '0.2256', 'num_input_tokens_seen': 18345214, 'train_runtime': '9280', 'train_tokens_per_second': '1977'} +{'loss': '0.3242', 'grad_norm': '0.8149', 'learning_rate': '4.992e-05', 'epoch': '0.2257', 'num_input_tokens_seen': 18347261, 'train_runtime': '9281', 'train_tokens_per_second': '1977'} +{'loss': '0.9235', 'grad_norm': '1.448', 'learning_rate': '4.992e-05', 'epoch': '0.2257', 'num_input_tokens_seen': 18349308, 'train_runtime': '9282', 'train_tokens_per_second': '1977'} +{'loss': '0.272', 'grad_norm': '0.9516', 'learning_rate': '4.992e-05', 'epoch': '0.2257', 'num_input_tokens_seen': 18351355, 'train_runtime': '9283', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.95', 'learning_rate': '4.992e-05', 'epoch': '0.2257', 'num_input_tokens_seen': 18353402, 'train_runtime': '9284', 'train_tokens_per_second': '1977'} +{'loss': '1.401', 'grad_norm': '2.034', 'learning_rate': '4.992e-05', 'epoch': '0.2258', 'num_input_tokens_seen': 18355449, 'train_runtime': '9285', 'train_tokens_per_second': '1977'} +{'loss': '0.569', 'grad_norm': '1.33', 'learning_rate': '4.992e-05', 'epoch': '0.2258', 'num_input_tokens_seen': 18357496, 'train_runtime': '9286', 'train_tokens_per_second': '1977'} +{'loss': '0.8947', 'grad_norm': '1.381', 'learning_rate': '4.992e-05', 'epoch': '0.2258', 'num_input_tokens_seen': 18359543, 'train_runtime': '9287', 'train_tokens_per_second': '1977'} +{'loss': '0.3279', 'grad_norm': '0.8377', 'learning_rate': '4.992e-05', 'epoch': '0.2258', 'num_input_tokens_seen': 18361590, 'train_runtime': '9288', 'train_tokens_per_second': '1977'} +{'loss': '1.256', 'grad_norm': '1.847', 'learning_rate': '4.992e-05', 'epoch': '0.2259', 'num_input_tokens_seen': 18363637, 'train_runtime': '9289', 'train_tokens_per_second': '1977'} +{'loss': '1.429', 'grad_norm': '2.123', 'learning_rate': '4.992e-05', 'epoch': '0.2259', 'num_input_tokens_seen': 18365684, 'train_runtime': '9290', 'train_tokens_per_second': '1977'} +{'loss': '0.6487', 'grad_norm': '1.579', 'learning_rate': '4.992e-05', 'epoch': '0.2259', 'num_input_tokens_seen': 18367731, 'train_runtime': '9291', 'train_tokens_per_second': '1977'} +{'loss': '0.8243', 'grad_norm': '1.672', 'learning_rate': '4.992e-05', 'epoch': '0.2259', 'num_input_tokens_seen': 18369778, 'train_runtime': '9292', 'train_tokens_per_second': '1977'} +{'loss': '1.332', 'grad_norm': '2.078', 'learning_rate': '4.992e-05', 'epoch': '0.226', 'num_input_tokens_seen': 18371825, 'train_runtime': '9293', 'train_tokens_per_second': '1977'} +{'loss': '0.3021', 'grad_norm': '0.8343', 'learning_rate': '4.992e-05', 'epoch': '0.226', 'num_input_tokens_seen': 18373872, 'train_runtime': '9294', 'train_tokens_per_second': '1977'} +{'loss': '0.4739', 'grad_norm': '1.17', 'learning_rate': '4.992e-05', 'epoch': '0.226', 'num_input_tokens_seen': 18375919, 'train_runtime': '9295', 'train_tokens_per_second': '1977'} +{'loss': '1.181', 'grad_norm': '1.449', 'learning_rate': '4.992e-05', 'epoch': '0.226', 'num_input_tokens_seen': 18377966, 'train_runtime': '9297', 'train_tokens_per_second': '1977'} +{'loss': '0.6708', 'grad_norm': '0.933', 'learning_rate': '4.992e-05', 'epoch': '0.2261', 'num_input_tokens_seen': 18380013, 'train_runtime': '9298', 'train_tokens_per_second': '1977'} +{'loss': '0.4533', 'grad_norm': '1.339', 'learning_rate': '4.992e-05', 'epoch': '0.2261', 'num_input_tokens_seen': 18382060, 'train_runtime': '9299', 'train_tokens_per_second': '1977'} +{'loss': '0.3872', 'grad_norm': '0.9766', 'learning_rate': '4.992e-05', 'epoch': '0.2261', 'num_input_tokens_seen': 18384107, 'train_runtime': '9300', 'train_tokens_per_second': '1977'} +{'loss': '0.5772', 'grad_norm': '1.104', 'learning_rate': '4.992e-05', 'epoch': '0.2262', 'num_input_tokens_seen': 18386154, 'train_runtime': '9301', 'train_tokens_per_second': '1977'} +{'loss': '0.852', 'grad_norm': '1.275', 'learning_rate': '4.992e-05', 'epoch': '0.2262', 'num_input_tokens_seen': 18388201, 'train_runtime': '9302', 'train_tokens_per_second': '1977'} +{'loss': '0.4704', 'grad_norm': '1.028', 'learning_rate': '4.992e-05', 'epoch': '0.2262', 'num_input_tokens_seen': 18390248, 'train_runtime': '9303', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.011', 'learning_rate': '4.992e-05', 'epoch': '0.2262', 'num_input_tokens_seen': 18392295, 'train_runtime': '9304', 'train_tokens_per_second': '1977'} +{'loss': '1.553', 'grad_norm': '1.976', 'learning_rate': '4.992e-05', 'epoch': '0.2263', 'num_input_tokens_seen': 18394342, 'train_runtime': '9305', 'train_tokens_per_second': '1977'} +{'loss': '0.3203', 'grad_norm': '0.9733', 'learning_rate': '4.992e-05', 'epoch': '0.2263', 'num_input_tokens_seen': 18396389, 'train_runtime': '9306', 'train_tokens_per_second': '1977'} +{'loss': '1.315', 'grad_norm': '2.077', 'learning_rate': '4.992e-05', 'epoch': '0.2263', 'num_input_tokens_seen': 18398436, 'train_runtime': '9307', 'train_tokens_per_second': '1977'} +{'loss': '1.426', 'grad_norm': '2.196', 'learning_rate': '4.992e-05', 'epoch': '0.2263', 'num_input_tokens_seen': 18400483, 'train_runtime': '9308', 'train_tokens_per_second': '1977'} +{'loss': '0.2029', 'grad_norm': '0.8401', 'learning_rate': '4.992e-05', 'epoch': '0.2264', 'num_input_tokens_seen': 18402530, 'train_runtime': '9309', 'train_tokens_per_second': '1977'} +{'loss': '0.9914', 'grad_norm': '1.526', 'learning_rate': '4.992e-05', 'epoch': '0.2264', 'num_input_tokens_seen': 18404577, 'train_runtime': '9310', 'train_tokens_per_second': '1977'} +{'loss': '0.6548', 'grad_norm': '1.398', 'learning_rate': '4.992e-05', 'epoch': '0.2264', 'num_input_tokens_seen': 18406624, 'train_runtime': '9311', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '2.039', 'learning_rate': '4.992e-05', 'epoch': '0.2264', 'num_input_tokens_seen': 18408671, 'train_runtime': '9312', 'train_tokens_per_second': '1977'} +{'loss': '0.4729', 'grad_norm': '1.15', 'learning_rate': '4.992e-05', 'epoch': '0.2265', 'num_input_tokens_seen': 18410718, 'train_runtime': '9313', 'train_tokens_per_second': '1977'} +{'loss': '0.5227', 'grad_norm': '1.131', 'learning_rate': '4.992e-05', 'epoch': '0.2265', 'num_input_tokens_seen': 18412765, 'train_runtime': '9314', 'train_tokens_per_second': '1977'} +{'loss': '0.3896', 'grad_norm': '0.892', 'learning_rate': '4.992e-05', 'epoch': '0.2265', 'num_input_tokens_seen': 18414812, 'train_runtime': '9315', 'train_tokens_per_second': '1977'} +{'loss': '0.5565', 'grad_norm': '1.245', 'learning_rate': '4.992e-05', 'epoch': '0.2265', 'num_input_tokens_seen': 18416859, 'train_runtime': '9316', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.604', 'learning_rate': '4.992e-05', 'epoch': '0.2266', 'num_input_tokens_seen': 18418906, 'train_runtime': '9317', 'train_tokens_per_second': '1977'} +{'loss': '1.64', 'grad_norm': '2.062', 'learning_rate': '4.992e-05', 'epoch': '0.2266', 'num_input_tokens_seen': 18420953, 'train_runtime': '9318', 'train_tokens_per_second': '1977'} +{'loss': '0.8945', 'grad_norm': '1.922', 'learning_rate': '4.992e-05', 'epoch': '0.2266', 'num_input_tokens_seen': 18423000, 'train_runtime': '9319', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 05:12:44,737 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 05:12:44,737 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 05:12:45,268 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-9000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 05:12:45,275 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-9000/tokenizer_config.json + +{'loss': '0.7539', 'grad_norm': '1.563', 'learning_rate': '4.992e-05', 'epoch': '0.2266', 'num_input_tokens_seen': 18425047, 'train_runtime': '9321', 'train_tokens_per_second': '1977'} +{'loss': '0.7262', 'grad_norm': '1.217', 'learning_rate': '4.992e-05', 'epoch': '0.2267', 'num_input_tokens_seen': 18427094, 'train_runtime': '9322', 'train_tokens_per_second': '1977'} +{'loss': '0.3825', 'grad_norm': '0.9224', 'learning_rate': '4.992e-05', 'epoch': '0.2267', 'num_input_tokens_seen': 18429141, 'train_runtime': '9323', 'train_tokens_per_second': '1977'} +{'loss': '1.187', 'grad_norm': '1.971', 'learning_rate': '4.992e-05', 'epoch': '0.2267', 'num_input_tokens_seen': 18431188, 'train_runtime': '9324', 'train_tokens_per_second': '1977'} +{'loss': '1.728', 'grad_norm': '2.805', 'learning_rate': '4.992e-05', 'epoch': '0.2267', 'num_input_tokens_seen': 18433235, 'train_runtime': '9325', 'train_tokens_per_second': '1977'} +{'loss': '0.6314', 'grad_norm': '1.13', 'learning_rate': '4.992e-05', 'epoch': '0.2268', 'num_input_tokens_seen': 18435282, 'train_runtime': '9326', 'train_tokens_per_second': '1977'} +{'loss': '0.9989', 'grad_norm': '1.476', 'learning_rate': '4.992e-05', 'epoch': '0.2268', 'num_input_tokens_seen': 18437329, 'train_runtime': '9327', 'train_tokens_per_second': '1977'} +{'loss': '0.9978', 'grad_norm': '1.798', 'learning_rate': '4.992e-05', 'epoch': '0.2268', 'num_input_tokens_seen': 18439376, 'train_runtime': '9328', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '1.42', 'learning_rate': '4.992e-05', 'epoch': '0.2268', 'num_input_tokens_seen': 18441423, 'train_runtime': '9329', 'train_tokens_per_second': '1977'} +{'loss': '0.9314', 'grad_norm': '1.808', 'learning_rate': '4.992e-05', 'epoch': '0.2269', 'num_input_tokens_seen': 18443470, 'train_runtime': '9330', 'train_tokens_per_second': '1977'} +{'loss': '1.391', 'grad_norm': '1.936', 'learning_rate': '4.992e-05', 'epoch': '0.2269', 'num_input_tokens_seen': 18445517, 'train_runtime': '9331', 'train_tokens_per_second': '1977'} +{'loss': '0.4238', 'grad_norm': '1.112', 'learning_rate': '4.992e-05', 'epoch': '0.2269', 'num_input_tokens_seen': 18447564, 'train_runtime': '9332', 'train_tokens_per_second': '1977'} +{'loss': '0.308', 'grad_norm': '0.9858', 'learning_rate': '4.992e-05', 'epoch': '0.2269', 'num_input_tokens_seen': 18449611, 'train_runtime': '9333', 'train_tokens_per_second': '1977'} +{'loss': '0.7803', 'grad_norm': '1.679', 'learning_rate': '4.992e-05', 'epoch': '0.227', 'num_input_tokens_seen': 18451658, 'train_runtime': '9334', 'train_tokens_per_second': '1977'} +{'loss': '0.812', 'grad_norm': '1.623', 'learning_rate': '4.992e-05', 'epoch': '0.227', 'num_input_tokens_seen': 18453705, 'train_runtime': '9335', 'train_tokens_per_second': '1977'} +{'loss': '1.337', 'grad_norm': '1.816', 'learning_rate': '4.992e-05', 'epoch': '0.227', 'num_input_tokens_seen': 18455752, 'train_runtime': '9337', 'train_tokens_per_second': '1977'} +{'loss': '0.4711', 'grad_norm': '1.249', 'learning_rate': '4.992e-05', 'epoch': '0.227', 'num_input_tokens_seen': 18457799, 'train_runtime': '9338', 'train_tokens_per_second': '1977'} +{'loss': '0.3355', 'grad_norm': '1.068', 'learning_rate': '4.992e-05', 'epoch': '0.2271', 'num_input_tokens_seen': 18459846, 'train_runtime': '9339', 'train_tokens_per_second': '1977'} +{'loss': '1.247', 'grad_norm': '1.58', 'learning_rate': '4.992e-05', 'epoch': '0.2271', 'num_input_tokens_seen': 18461893, 'train_runtime': '9340', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.731', 'learning_rate': '4.992e-05', 'epoch': '0.2271', 'num_input_tokens_seen': 18463940, 'train_runtime': '9341', 'train_tokens_per_second': '1977'} +{'loss': '0.3222', 'grad_norm': '1.106', 'learning_rate': '4.992e-05', 'epoch': '0.2271', 'num_input_tokens_seen': 18465987, 'train_runtime': '9342', 'train_tokens_per_second': '1977'} +{'loss': '1.072', 'grad_norm': '1.476', 'learning_rate': '4.992e-05', 'epoch': '0.2272', 'num_input_tokens_seen': 18468034, 'train_runtime': '9343', 'train_tokens_per_second': '1977'} +{'loss': '0.9975', 'grad_norm': '1.9', 'learning_rate': '4.992e-05', 'epoch': '0.2272', 'num_input_tokens_seen': 18470081, 'train_runtime': '9344', 'train_tokens_per_second': '1977'} +{'loss': '1.057', 'grad_norm': '2.167', 'learning_rate': '4.992e-05', 'epoch': '0.2272', 'num_input_tokens_seen': 18472128, 'train_runtime': '9345', 'train_tokens_per_second': '1977'} +{'loss': '0.3235', 'grad_norm': '1', 'learning_rate': '4.992e-05', 'epoch': '0.2272', 'num_input_tokens_seen': 18474175, 'train_runtime': '9346', 'train_tokens_per_second': '1977'} +{'loss': '0.3291', 'grad_norm': '1.085', 'learning_rate': '4.992e-05', 'epoch': '0.2273', 'num_input_tokens_seen': 18476222, 'train_runtime': '9347', 'train_tokens_per_second': '1977'} +{'loss': '1.354', 'grad_norm': '1.982', 'learning_rate': '4.992e-05', 'epoch': '0.2273', 'num_input_tokens_seen': 18478269, 'train_runtime': '9348', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.744', 'learning_rate': '4.992e-05', 'epoch': '0.2273', 'num_input_tokens_seen': 18480316, 'train_runtime': '9349', 'train_tokens_per_second': '1977'} +{'loss': '0.3317', 'grad_norm': '0.8178', 'learning_rate': '4.992e-05', 'epoch': '0.2273', 'num_input_tokens_seen': 18482363, 'train_runtime': '9350', 'train_tokens_per_second': '1977'} +{'loss': '0.7314', 'grad_norm': '1.486', 'learning_rate': '4.992e-05', 'epoch': '0.2274', 'num_input_tokens_seen': 18484410, 'train_runtime': '9351', 'train_tokens_per_second': '1977'} +{'loss': '1.215', 'grad_norm': '1.445', 'learning_rate': '4.992e-05', 'epoch': '0.2274', 'num_input_tokens_seen': 18486457, 'train_runtime': '9352', 'train_tokens_per_second': '1977'} +{'loss': '0.8138', 'grad_norm': '1.197', 'learning_rate': '4.992e-05', 'epoch': '0.2274', 'num_input_tokens_seen': 18488504, 'train_runtime': '9353', 'train_tokens_per_second': '1977'} +{'loss': '0.5418', 'grad_norm': '1.297', 'learning_rate': '4.992e-05', 'epoch': '0.2274', 'num_input_tokens_seen': 18490551, 'train_runtime': '9354', 'train_tokens_per_second': '1977'} +{'loss': '0.3544', 'grad_norm': '0.9003', 'learning_rate': '4.992e-05', 'epoch': '0.2275', 'num_input_tokens_seen': 18492598, 'train_runtime': '9355', 'train_tokens_per_second': '1977'} +{'loss': '1.596', 'grad_norm': '2.291', 'learning_rate': '4.992e-05', 'epoch': '0.2275', 'num_input_tokens_seen': 18494645, 'train_runtime': '9356', 'train_tokens_per_second': '1977'} +{'loss': '0.3514', 'grad_norm': '0.8445', 'learning_rate': '4.992e-05', 'epoch': '0.2275', 'num_input_tokens_seen': 18496692, 'train_runtime': '9357', 'train_tokens_per_second': '1977'} +{'loss': '0.704', 'grad_norm': '1.201', 'learning_rate': '4.992e-05', 'epoch': '0.2275', 'num_input_tokens_seen': 18498739, 'train_runtime': '9358', 'train_tokens_per_second': '1977'} +{'loss': '0.3041', 'grad_norm': '0.9734', 'learning_rate': '4.992e-05', 'epoch': '0.2276', 'num_input_tokens_seen': 18500786, 'train_runtime': '9359', 'train_tokens_per_second': '1977'} +{'loss': '1.147', 'grad_norm': '1.611', 'learning_rate': '4.992e-05', 'epoch': '0.2276', 'num_input_tokens_seen': 18502833, 'train_runtime': '9360', 'train_tokens_per_second': '1977'} +{'loss': '0.4101', 'grad_norm': '1.054', 'learning_rate': '4.992e-05', 'epoch': '0.2276', 'num_input_tokens_seen': 18504880, 'train_runtime': '9361', 'train_tokens_per_second': '1977'} +{'loss': '0.4023', 'grad_norm': '1.054', 'learning_rate': '4.992e-05', 'epoch': '0.2276', 'num_input_tokens_seen': 18506927, 'train_runtime': '9362', 'train_tokens_per_second': '1977'} +{'loss': '0.3304', 'grad_norm': '0.9185', 'learning_rate': '4.992e-05', 'epoch': '0.2277', 'num_input_tokens_seen': 18508974, 'train_runtime': '9363', 'train_tokens_per_second': '1977'} +{'loss': '0.6401', 'grad_norm': '1.212', 'learning_rate': '4.992e-05', 'epoch': '0.2277', 'num_input_tokens_seen': 18511021, 'train_runtime': '9364', 'train_tokens_per_second': '1977'} +{'loss': '0.969', 'grad_norm': '1.478', 'learning_rate': '4.992e-05', 'epoch': '0.2277', 'num_input_tokens_seen': 18513068, 'train_runtime': '9365', 'train_tokens_per_second': '1977'} +{'loss': '0.552', 'grad_norm': '1.123', 'learning_rate': '4.992e-05', 'epoch': '0.2277', 'num_input_tokens_seen': 18515115, 'train_runtime': '9366', 'train_tokens_per_second': '1977'} +{'loss': '0.7828', 'grad_norm': '1.789', 'learning_rate': '4.992e-05', 'epoch': '0.2278', 'num_input_tokens_seen': 18517162, 'train_runtime': '9367', 'train_tokens_per_second': '1977'} +{'loss': '0.4897', 'grad_norm': '1.254', 'learning_rate': '4.992e-05', 'epoch': '0.2278', 'num_input_tokens_seen': 18519209, 'train_runtime': '9369', 'train_tokens_per_second': '1977'} +{'loss': '0.4422', 'grad_norm': '1.118', 'learning_rate': '4.992e-05', 'epoch': '0.2278', 'num_input_tokens_seen': 18521256, 'train_runtime': '9370', 'train_tokens_per_second': '1977'} +{'loss': '0.5769', 'grad_norm': '1.114', 'learning_rate': '4.992e-05', 'epoch': '0.2278', 'num_input_tokens_seen': 18523303, 'train_runtime': '9371', 'train_tokens_per_second': '1977'} +{'loss': '0.7555', 'grad_norm': '1.711', 'learning_rate': '4.992e-05', 'epoch': '0.2279', 'num_input_tokens_seen': 18525350, 'train_runtime': '9372', 'train_tokens_per_second': '1977'} +{'loss': '0.3474', 'grad_norm': '1.172', 'learning_rate': '4.992e-05', 'epoch': '0.2279', 'num_input_tokens_seen': 18527397, 'train_runtime': '9373', 'train_tokens_per_second': '1977'} +{'loss': '0.976', 'grad_norm': '1.578', 'learning_rate': '4.992e-05', 'epoch': '0.2279', 'num_input_tokens_seen': 18529444, 'train_runtime': '9374', 'train_tokens_per_second': '1977'} +{'loss': '0.5214', 'grad_norm': '1.247', 'learning_rate': '4.992e-05', 'epoch': '0.2279', 'num_input_tokens_seen': 18531491, 'train_runtime': '9375', 'train_tokens_per_second': '1977'} +{'loss': '0.2274', 'grad_norm': '0.9265', 'learning_rate': '4.992e-05', 'epoch': '0.228', 'num_input_tokens_seen': 18533538, 'train_runtime': '9376', 'train_tokens_per_second': '1977'} +{'loss': '0.9843', 'grad_norm': '1.454', 'learning_rate': '4.992e-05', 'epoch': '0.228', 'num_input_tokens_seen': 18535585, 'train_runtime': '9377', 'train_tokens_per_second': '1977'} +{'loss': '0.7273', 'grad_norm': '1.637', 'learning_rate': '4.992e-05', 'epoch': '0.228', 'num_input_tokens_seen': 18537632, 'train_runtime': '9378', 'train_tokens_per_second': '1977'} +{'loss': '1.088', 'grad_norm': '2.469', 'learning_rate': '4.992e-05', 'epoch': '0.228', 'num_input_tokens_seen': 18539679, 'train_runtime': '9379', 'train_tokens_per_second': '1977'} +{'loss': '0.7357', 'grad_norm': '1.629', 'learning_rate': '4.992e-05', 'epoch': '0.2281', 'num_input_tokens_seen': 18541726, 'train_runtime': '9380', 'train_tokens_per_second': '1977'} +{'loss': '0.9902', 'grad_norm': '1.976', 'learning_rate': '4.992e-05', 'epoch': '0.2281', 'num_input_tokens_seen': 18543773, 'train_runtime': '9381', 'train_tokens_per_second': '1977'} +{'loss': '1.607', 'grad_norm': '2.803', 'learning_rate': '4.992e-05', 'epoch': '0.2281', 'num_input_tokens_seen': 18545820, 'train_runtime': '9382', 'train_tokens_per_second': '1977'} +{'loss': '0.3449', 'grad_norm': '0.9917', 'learning_rate': '4.992e-05', 'epoch': '0.2281', 'num_input_tokens_seen': 18547867, 'train_runtime': '9383', 'train_tokens_per_second': '1977'} +{'loss': '0.8488', 'grad_norm': '1.442', 'learning_rate': '4.992e-05', 'epoch': '0.2282', 'num_input_tokens_seen': 18549914, 'train_runtime': '9384', 'train_tokens_per_second': '1977'} +{'loss': '0.7014', 'grad_norm': '1.259', 'learning_rate': '4.992e-05', 'epoch': '0.2282', 'num_input_tokens_seen': 18551961, 'train_runtime': '9385', 'train_tokens_per_second': '1977'} +{'loss': '1.467', 'grad_norm': '2.726', 'learning_rate': '4.992e-05', 'epoch': '0.2282', 'num_input_tokens_seen': 18554008, 'train_runtime': '9386', 'train_tokens_per_second': '1977'} +{'loss': '0.9332', 'grad_norm': '1.382', 'learning_rate': '4.992e-05', 'epoch': '0.2282', 'num_input_tokens_seen': 18556055, 'train_runtime': '9387', 'train_tokens_per_second': '1977'} +{'loss': '1.569', 'grad_norm': '2.186', 'learning_rate': '4.992e-05', 'epoch': '0.2283', 'num_input_tokens_seen': 18558102, 'train_runtime': '9388', 'train_tokens_per_second': '1977'} +{'loss': '1.553', 'grad_norm': '2.698', 'learning_rate': '4.992e-05', 'epoch': '0.2283', 'num_input_tokens_seen': 18560149, 'train_runtime': '9389', 'train_tokens_per_second': '1977'} +{'loss': '0.3439', 'grad_norm': '1.075', 'learning_rate': '4.992e-05', 'epoch': '0.2283', 'num_input_tokens_seen': 18562196, 'train_runtime': '9390', 'train_tokens_per_second': '1977'} +{'loss': '0.9839', 'grad_norm': '1.445', 'learning_rate': '4.992e-05', 'epoch': '0.2283', 'num_input_tokens_seen': 18564243, 'train_runtime': '9391', 'train_tokens_per_second': '1977'} +{'loss': '0.6102', 'grad_norm': '0.9439', 'learning_rate': '4.992e-05', 'epoch': '0.2284', 'num_input_tokens_seen': 18566290, 'train_runtime': '9392', 'train_tokens_per_second': '1977'} +{'loss': '0.5567', 'grad_norm': '1.307', 'learning_rate': '4.992e-05', 'epoch': '0.2284', 'num_input_tokens_seen': 18568337, 'train_runtime': '9393', 'train_tokens_per_second': '1977'} +{'loss': '0.6787', 'grad_norm': '1.166', 'learning_rate': '4.992e-05', 'epoch': '0.2284', 'num_input_tokens_seen': 18570384, 'train_runtime': '9394', 'train_tokens_per_second': '1977'} +{'loss': '0.2532', 'grad_norm': '0.8537', 'learning_rate': '4.992e-05', 'epoch': '0.2284', 'num_input_tokens_seen': 18572431, 'train_runtime': '9395', 'train_tokens_per_second': '1977'} +{'loss': '0.525', 'grad_norm': '1.271', 'learning_rate': '4.992e-05', 'epoch': '0.2285', 'num_input_tokens_seen': 18574478, 'train_runtime': '9396', 'train_tokens_per_second': '1977'} +{'loss': '0.3193', 'grad_norm': '0.9742', 'learning_rate': '4.992e-05', 'epoch': '0.2285', 'num_input_tokens_seen': 18576525, 'train_runtime': '9397', 'train_tokens_per_second': '1977'} +{'loss': '1.415', 'grad_norm': '1.656', 'learning_rate': '4.992e-05', 'epoch': '0.2285', 'num_input_tokens_seen': 18578572, 'train_runtime': '9398', 'train_tokens_per_second': '1977'} +{'loss': '0.721', 'grad_norm': '1.08', 'learning_rate': '4.992e-05', 'epoch': '0.2285', 'num_input_tokens_seen': 18580619, 'train_runtime': '9400', 'train_tokens_per_second': '1977'} +{'loss': '0.6649', 'grad_norm': '1.381', 'learning_rate': '4.992e-05', 'epoch': '0.2286', 'num_input_tokens_seen': 18582666, 'train_runtime': '9401', 'train_tokens_per_second': '1977'} +{'loss': '1.132', 'grad_norm': '1.572', 'learning_rate': '4.992e-05', 'epoch': '0.2286', 'num_input_tokens_seen': 18584713, 'train_runtime': '9402', 'train_tokens_per_second': '1977'} +{'loss': '1.75', 'grad_norm': '2.362', 'learning_rate': '4.992e-05', 'epoch': '0.2286', 'num_input_tokens_seen': 18586760, 'train_runtime': '9403', 'train_tokens_per_second': '1977'} +{'loss': '0.722', 'grad_norm': '1.384', 'learning_rate': '4.992e-05', 'epoch': '0.2286', 'num_input_tokens_seen': 18588807, 'train_runtime': '9404', 'train_tokens_per_second': '1977'} +{'loss': '0.5127', 'grad_norm': '0.9733', 'learning_rate': '4.992e-05', 'epoch': '0.2287', 'num_input_tokens_seen': 18590854, 'train_runtime': '9405', 'train_tokens_per_second': '1977'} +{'loss': '0.2527', 'grad_norm': '0.9797', 'learning_rate': '4.991e-05', 'epoch': '0.2287', 'num_input_tokens_seen': 18592901, 'train_runtime': '9406', 'train_tokens_per_second': '1977'} +{'loss': '0.3836', 'grad_norm': '0.8321', 'learning_rate': '4.991e-05', 'epoch': '0.2287', 'num_input_tokens_seen': 18594948, 'train_runtime': '9407', 'train_tokens_per_second': '1977'} +{'loss': '0.8738', 'grad_norm': '1.293', 'learning_rate': '4.991e-05', 'epoch': '0.2287', 'num_input_tokens_seen': 18596995, 'train_runtime': '9408', 'train_tokens_per_second': '1977'} +{'loss': '0.9543', 'grad_norm': '1.797', 'learning_rate': '4.991e-05', 'epoch': '0.2288', 'num_input_tokens_seen': 18599042, 'train_runtime': '9409', 'train_tokens_per_second': '1977'} +{'loss': '1.013', 'grad_norm': '2.027', 'learning_rate': '4.991e-05', 'epoch': '0.2288', 'num_input_tokens_seen': 18601089, 'train_runtime': '9410', 'train_tokens_per_second': '1977'} +{'loss': '0.7527', 'grad_norm': '1.425', 'learning_rate': '4.991e-05', 'epoch': '0.2288', 'num_input_tokens_seen': 18603136, 'train_runtime': '9411', 'train_tokens_per_second': '1977'} +{'loss': '0.5341', 'grad_norm': '1.257', 'learning_rate': '4.991e-05', 'epoch': '0.2288', 'num_input_tokens_seen': 18605183, 'train_runtime': '9412', 'train_tokens_per_second': '1977'} +{'loss': '0.2895', 'grad_norm': '0.9286', 'learning_rate': '4.991e-05', 'epoch': '0.2289', 'num_input_tokens_seen': 18607230, 'train_runtime': '9413', 'train_tokens_per_second': '1977'} +{'loss': '0.7867', 'grad_norm': '1.377', 'learning_rate': '4.991e-05', 'epoch': '0.2289', 'num_input_tokens_seen': 18609277, 'train_runtime': '9414', 'train_tokens_per_second': '1977'} +{'loss': '0.5121', 'grad_norm': '1.081', 'learning_rate': '4.991e-05', 'epoch': '0.2289', 'num_input_tokens_seen': 18611324, 'train_runtime': '9415', 'train_tokens_per_second': '1977'} +{'loss': '1.967', 'grad_norm': '2.68', 'learning_rate': '4.991e-05', 'epoch': '0.2289', 'num_input_tokens_seen': 18613371, 'train_runtime': '9416', 'train_tokens_per_second': '1977'} +{'loss': '1.572', 'grad_norm': '2.79', 'learning_rate': '4.991e-05', 'epoch': '0.229', 'num_input_tokens_seen': 18615418, 'train_runtime': '9417', 'train_tokens_per_second': '1977'} +{'loss': '0.474', 'grad_norm': '1.132', 'learning_rate': '4.991e-05', 'epoch': '0.229', 'num_input_tokens_seen': 18617465, 'train_runtime': '9418', 'train_tokens_per_second': '1977'} +{'loss': '0.5058', 'grad_norm': '1.095', 'learning_rate': '4.991e-05', 'epoch': '0.229', 'num_input_tokens_seen': 18619512, 'train_runtime': '9419', 'train_tokens_per_second': '1977'} +{'loss': '0.7322', 'grad_norm': '1.115', 'learning_rate': '4.991e-05', 'epoch': '0.229', 'num_input_tokens_seen': 18621559, 'train_runtime': '9420', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '1.589', 'learning_rate': '4.991e-05', 'epoch': '0.2291', 'num_input_tokens_seen': 18623606, 'train_runtime': '9421', 'train_tokens_per_second': '1977'} +{'loss': '1.208', 'grad_norm': '1.742', 'learning_rate': '4.991e-05', 'epoch': '0.2291', 'num_input_tokens_seen': 18625653, 'train_runtime': '9422', 'train_tokens_per_second': '1977'} +{'loss': '0.353', 'grad_norm': '1.247', 'learning_rate': '4.991e-05', 'epoch': '0.2291', 'num_input_tokens_seen': 18627700, 'train_runtime': '9423', 'train_tokens_per_second': '1977'} +{'loss': '0.482', 'grad_norm': '1.077', 'learning_rate': '4.991e-05', 'epoch': '0.2291', 'num_input_tokens_seen': 18629747, 'train_runtime': '9424', 'train_tokens_per_second': '1977'} +{'loss': '0.6726', 'grad_norm': '1.158', 'learning_rate': '4.991e-05', 'epoch': '0.2292', 'num_input_tokens_seen': 18631794, 'train_runtime': '9425', 'train_tokens_per_second': '1977'} +{'loss': '0.8081', 'grad_norm': '1.348', 'learning_rate': '4.991e-05', 'epoch': '0.2292', 'num_input_tokens_seen': 18633841, 'train_runtime': '9426', 'train_tokens_per_second': '1977'} +{'loss': '0.6176', 'grad_norm': '1.139', 'learning_rate': '4.991e-05', 'epoch': '0.2292', 'num_input_tokens_seen': 18635888, 'train_runtime': '9427', 'train_tokens_per_second': '1977'} +{'loss': '1.909', 'grad_norm': '2.632', 'learning_rate': '4.991e-05', 'epoch': '0.2292', 'num_input_tokens_seen': 18637935, 'train_runtime': '9428', 'train_tokens_per_second': '1977'} +{'loss': '1.749', 'grad_norm': '2.626', 'learning_rate': '4.991e-05', 'epoch': '0.2293', 'num_input_tokens_seen': 18639982, 'train_runtime': '9429', 'train_tokens_per_second': '1977'} +{'loss': '0.9019', 'grad_norm': '1.304', 'learning_rate': '4.991e-05', 'epoch': '0.2293', 'num_input_tokens_seen': 18642029, 'train_runtime': '9430', 'train_tokens_per_second': '1977'} +{'loss': '0.8844', 'grad_norm': '1.602', 'learning_rate': '4.991e-05', 'epoch': '0.2293', 'num_input_tokens_seen': 18644076, 'train_runtime': '9432', 'train_tokens_per_second': '1977'} +{'loss': '0.4367', 'grad_norm': '0.9162', 'learning_rate': '4.991e-05', 'epoch': '0.2293', 'num_input_tokens_seen': 18646123, 'train_runtime': '9433', 'train_tokens_per_second': '1977'} +{'loss': '0.6448', 'grad_norm': '1.295', 'learning_rate': '4.991e-05', 'epoch': '0.2294', 'num_input_tokens_seen': 18648170, 'train_runtime': '9434', 'train_tokens_per_second': '1977'} +{'loss': '0.8836', 'grad_norm': '1.234', 'learning_rate': '4.991e-05', 'epoch': '0.2294', 'num_input_tokens_seen': 18650217, 'train_runtime': '9435', 'train_tokens_per_second': '1977'} +{'loss': '0.2434', 'grad_norm': '0.8792', 'learning_rate': '4.991e-05', 'epoch': '0.2294', 'num_input_tokens_seen': 18652264, 'train_runtime': '9436', 'train_tokens_per_second': '1977'} +{'loss': '0.8045', 'grad_norm': '1.263', 'learning_rate': '4.991e-05', 'epoch': '0.2294', 'num_input_tokens_seen': 18654311, 'train_runtime': '9437', 'train_tokens_per_second': '1977'} +{'loss': '0.3673', 'grad_norm': '1.021', 'learning_rate': '4.991e-05', 'epoch': '0.2295', 'num_input_tokens_seen': 18656358, 'train_runtime': '9438', 'train_tokens_per_second': '1977'} +{'loss': '0.5708', 'grad_norm': '1.382', 'learning_rate': '4.991e-05', 'epoch': '0.2295', 'num_input_tokens_seen': 18658405, 'train_runtime': '9439', 'train_tokens_per_second': '1977'} +{'loss': '1.723', 'grad_norm': '2.377', 'learning_rate': '4.991e-05', 'epoch': '0.2295', 'num_input_tokens_seen': 18660452, 'train_runtime': '9440', 'train_tokens_per_second': '1977'} +{'loss': '1.114', 'grad_norm': '1.456', 'learning_rate': '4.991e-05', 'epoch': '0.2295', 'num_input_tokens_seen': 18662499, 'train_runtime': '9441', 'train_tokens_per_second': '1977'} +{'loss': '0.6394', 'grad_norm': '1.303', 'learning_rate': '4.991e-05', 'epoch': '0.2296', 'num_input_tokens_seen': 18664546, 'train_runtime': '9442', 'train_tokens_per_second': '1977'} +{'loss': '0.7517', 'grad_norm': '1.31', 'learning_rate': '4.991e-05', 'epoch': '0.2296', 'num_input_tokens_seen': 18666593, 'train_runtime': '9443', 'train_tokens_per_second': '1977'} +{'loss': '1.216', 'grad_norm': '2.351', 'learning_rate': '4.991e-05', 'epoch': '0.2296', 'num_input_tokens_seen': 18668640, 'train_runtime': '9444', 'train_tokens_per_second': '1977'} +{'loss': '0.3552', 'grad_norm': '0.9912', 'learning_rate': '4.991e-05', 'epoch': '0.2296', 'num_input_tokens_seen': 18670687, 'train_runtime': '9445', 'train_tokens_per_second': '1977'} +{'loss': '0.6818', 'grad_norm': '1.21', 'learning_rate': '4.991e-05', 'epoch': '0.2297', 'num_input_tokens_seen': 18672734, 'train_runtime': '9446', 'train_tokens_per_second': '1977'} +{'loss': '0.494', 'grad_norm': '1.106', 'learning_rate': '4.991e-05', 'epoch': '0.2297', 'num_input_tokens_seen': 18674781, 'train_runtime': '9447', 'train_tokens_per_second': '1977'} +{'loss': '0.4692', 'grad_norm': '1.055', 'learning_rate': '4.991e-05', 'epoch': '0.2297', 'num_input_tokens_seen': 18676828, 'train_runtime': '9448', 'train_tokens_per_second': '1977'} +{'loss': '1.773', 'grad_norm': '2.82', 'learning_rate': '4.991e-05', 'epoch': '0.2298', 'num_input_tokens_seen': 18678875, 'train_runtime': '9449', 'train_tokens_per_second': '1977'} +{'loss': '1.3', 'grad_norm': '1.513', 'learning_rate': '4.991e-05', 'epoch': '0.2298', 'num_input_tokens_seen': 18680922, 'train_runtime': '9450', 'train_tokens_per_second': '1977'} +{'loss': '1.416', 'grad_norm': '2.12', 'learning_rate': '4.991e-05', 'epoch': '0.2298', 'num_input_tokens_seen': 18682969, 'train_runtime': '9451', 'train_tokens_per_second': '1977'} +{'loss': '0.5534', 'grad_norm': '1.222', 'learning_rate': '4.991e-05', 'epoch': '0.2298', 'num_input_tokens_seen': 18685016, 'train_runtime': '9452', 'train_tokens_per_second': '1977'} +{'loss': '0.558', 'grad_norm': '1.09', 'learning_rate': '4.991e-05', 'epoch': '0.2299', 'num_input_tokens_seen': 18687063, 'train_runtime': '9453', 'train_tokens_per_second': '1977'} +{'loss': '0.793', 'grad_norm': '1.256', 'learning_rate': '4.991e-05', 'epoch': '0.2299', 'num_input_tokens_seen': 18689110, 'train_runtime': '9454', 'train_tokens_per_second': '1977'} +{'loss': '0.5841', 'grad_norm': '1.258', 'learning_rate': '4.991e-05', 'epoch': '0.2299', 'num_input_tokens_seen': 18691157, 'train_runtime': '9455', 'train_tokens_per_second': '1977'} +{'loss': '0.2399', 'grad_norm': '0.9138', 'learning_rate': '4.991e-05', 'epoch': '0.2299', 'num_input_tokens_seen': 18693204, 'train_runtime': '9456', 'train_tokens_per_second': '1977'} +{'loss': '0.6839', 'grad_norm': '1.311', 'learning_rate': '4.991e-05', 'epoch': '0.23', 'num_input_tokens_seen': 18695251, 'train_runtime': '9457', 'train_tokens_per_second': '1977'} +{'loss': '0.6074', 'grad_norm': '1.016', 'learning_rate': '4.991e-05', 'epoch': '0.23', 'num_input_tokens_seen': 18697298, 'train_runtime': '9458', 'train_tokens_per_second': '1977'} +{'loss': '0.601', 'grad_norm': '1.592', 'learning_rate': '4.991e-05', 'epoch': '0.23', 'num_input_tokens_seen': 18699345, 'train_runtime': '9459', 'train_tokens_per_second': '1977'} +{'loss': '1.278', 'grad_norm': '2.097', 'learning_rate': '4.991e-05', 'epoch': '0.23', 'num_input_tokens_seen': 18701392, 'train_runtime': '9460', 'train_tokens_per_second': '1977'} +{'loss': '0.3412', 'grad_norm': '0.9534', 'learning_rate': '4.991e-05', 'epoch': '0.2301', 'num_input_tokens_seen': 18703439, 'train_runtime': '9461', 'train_tokens_per_second': '1977'} +{'loss': '0.431', 'grad_norm': '1.169', 'learning_rate': '4.991e-05', 'epoch': '0.2301', 'num_input_tokens_seen': 18705486, 'train_runtime': '9463', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '1.56', 'learning_rate': '4.991e-05', 'epoch': '0.2301', 'num_input_tokens_seen': 18707533, 'train_runtime': '9464', 'train_tokens_per_second': '1977'} +{'loss': '0.619', 'grad_norm': '1.215', 'learning_rate': '4.991e-05', 'epoch': '0.2301', 'num_input_tokens_seen': 18709580, 'train_runtime': '9465', 'train_tokens_per_second': '1977'} +{'loss': '0.7054', 'grad_norm': '1.068', 'learning_rate': '4.991e-05', 'epoch': '0.2302', 'num_input_tokens_seen': 18711627, 'train_runtime': '9466', 'train_tokens_per_second': '1977'} +{'loss': '0.3564', 'grad_norm': '0.9312', 'learning_rate': '4.991e-05', 'epoch': '0.2302', 'num_input_tokens_seen': 18713674, 'train_runtime': '9467', 'train_tokens_per_second': '1977'} +{'loss': '0.3866', 'grad_norm': '1.195', 'learning_rate': '4.991e-05', 'epoch': '0.2302', 'num_input_tokens_seen': 18715721, 'train_runtime': '9468', 'train_tokens_per_second': '1977'} +{'loss': '0.4556', 'grad_norm': '0.9298', 'learning_rate': '4.991e-05', 'epoch': '0.2302', 'num_input_tokens_seen': 18717768, 'train_runtime': '9469', 'train_tokens_per_second': '1977'} +{'loss': '0.6548', 'grad_norm': '1.494', 'learning_rate': '4.991e-05', 'epoch': '0.2303', 'num_input_tokens_seen': 18719815, 'train_runtime': '9470', 'train_tokens_per_second': '1977'} +{'loss': '0.3539', 'grad_norm': '0.926', 'learning_rate': '4.991e-05', 'epoch': '0.2303', 'num_input_tokens_seen': 18721862, 'train_runtime': '9471', 'train_tokens_per_second': '1977'} +{'loss': '0.8067', 'grad_norm': '1.402', 'learning_rate': '4.991e-05', 'epoch': '0.2303', 'num_input_tokens_seen': 18723909, 'train_runtime': '9472', 'train_tokens_per_second': '1977'} +{'loss': '0.4685', 'grad_norm': '1.234', 'learning_rate': '4.991e-05', 'epoch': '0.2303', 'num_input_tokens_seen': 18725956, 'train_runtime': '9473', 'train_tokens_per_second': '1977'} +{'loss': '0.5746', 'grad_norm': '1.268', 'learning_rate': '4.991e-05', 'epoch': '0.2304', 'num_input_tokens_seen': 18728003, 'train_runtime': '9474', 'train_tokens_per_second': '1977'} +{'loss': '0.6947', 'grad_norm': '1.427', 'learning_rate': '4.991e-05', 'epoch': '0.2304', 'num_input_tokens_seen': 18730050, 'train_runtime': '9475', 'train_tokens_per_second': '1977'} +{'loss': '0.5426', 'grad_norm': '1.106', 'learning_rate': '4.991e-05', 'epoch': '0.2304', 'num_input_tokens_seen': 18732097, 'train_runtime': '9476', 'train_tokens_per_second': '1977'} +{'loss': '0.5315', 'grad_norm': '1.132', 'learning_rate': '4.991e-05', 'epoch': '0.2304', 'num_input_tokens_seen': 18734144, 'train_runtime': '9477', 'train_tokens_per_second': '1977'} +{'loss': '0.4527', 'grad_norm': '1.074', 'learning_rate': '4.991e-05', 'epoch': '0.2305', 'num_input_tokens_seen': 18736191, 'train_runtime': '9478', 'train_tokens_per_second': '1977'} +{'loss': '0.9033', 'grad_norm': '1.524', 'learning_rate': '4.991e-05', 'epoch': '0.2305', 'num_input_tokens_seen': 18738238, 'train_runtime': '9479', 'train_tokens_per_second': '1977'} +{'loss': '1.23', 'grad_norm': '2.182', 'learning_rate': '4.991e-05', 'epoch': '0.2305', 'num_input_tokens_seen': 18740285, 'train_runtime': '9480', 'train_tokens_per_second': '1977'} +{'loss': '0.5372', 'grad_norm': '1.296', 'learning_rate': '4.991e-05', 'epoch': '0.2305', 'num_input_tokens_seen': 18742332, 'train_runtime': '9481', 'train_tokens_per_second': '1977'} +{'loss': '0.8681', 'grad_norm': '1.47', 'learning_rate': '4.991e-05', 'epoch': '0.2306', 'num_input_tokens_seen': 18744379, 'train_runtime': '9482', 'train_tokens_per_second': '1977'} +{'loss': '0.8411', 'grad_norm': '1.003', 'learning_rate': '4.991e-05', 'epoch': '0.2306', 'num_input_tokens_seen': 18746426, 'train_runtime': '9483', 'train_tokens_per_second': '1977'} +{'loss': '0.5057', 'grad_norm': '1.285', 'learning_rate': '4.991e-05', 'epoch': '0.2306', 'num_input_tokens_seen': 18748473, 'train_runtime': '9484', 'train_tokens_per_second': '1977'} +{'loss': '1.317', 'grad_norm': '1.951', 'learning_rate': '4.991e-05', 'epoch': '0.2306', 'num_input_tokens_seen': 18750520, 'train_runtime': '9485', 'train_tokens_per_second': '1977'} +{'loss': '0.7086', 'grad_norm': '1.3', 'learning_rate': '4.991e-05', 'epoch': '0.2307', 'num_input_tokens_seen': 18752567, 'train_runtime': '9486', 'train_tokens_per_second': '1977'} +{'loss': '0.4732', 'grad_norm': '1.216', 'learning_rate': '4.991e-05', 'epoch': '0.2307', 'num_input_tokens_seen': 18754614, 'train_runtime': '9487', 'train_tokens_per_second': '1977'} +{'loss': '0.6173', 'grad_norm': '1.093', 'learning_rate': '4.991e-05', 'epoch': '0.2307', 'num_input_tokens_seen': 18756661, 'train_runtime': '9488', 'train_tokens_per_second': '1977'} +{'loss': '0.9115', 'grad_norm': '1.412', 'learning_rate': '4.991e-05', 'epoch': '0.2307', 'num_input_tokens_seen': 18758708, 'train_runtime': '9489', 'train_tokens_per_second': '1977'} +{'loss': '0.3017', 'grad_norm': '0.8799', 'learning_rate': '4.991e-05', 'epoch': '0.2308', 'num_input_tokens_seen': 18760755, 'train_runtime': '9490', 'train_tokens_per_second': '1977'} +{'loss': '1.875', 'grad_norm': '2.255', 'learning_rate': '4.991e-05', 'epoch': '0.2308', 'num_input_tokens_seen': 18762802, 'train_runtime': '9491', 'train_tokens_per_second': '1977'} +{'loss': '1.585', 'grad_norm': '2.34', 'learning_rate': '4.991e-05', 'epoch': '0.2308', 'num_input_tokens_seen': 18764849, 'train_runtime': '9493', 'train_tokens_per_second': '1977'} +{'loss': '1.089', 'grad_norm': '1.69', 'learning_rate': '4.991e-05', 'epoch': '0.2308', 'num_input_tokens_seen': 18766896, 'train_runtime': '9494', 'train_tokens_per_second': '1977'} +{'loss': '0.9007', 'grad_norm': '1.186', 'learning_rate': '4.991e-05', 'epoch': '0.2309', 'num_input_tokens_seen': 18768943, 'train_runtime': '9495', 'train_tokens_per_second': '1977'} +{'loss': '0.4095', 'grad_norm': '1.026', 'learning_rate': '4.991e-05', 'epoch': '0.2309', 'num_input_tokens_seen': 18770990, 'train_runtime': '9496', 'train_tokens_per_second': '1977'} +{'loss': '0.5353', 'grad_norm': '0.943', 'learning_rate': '4.991e-05', 'epoch': '0.2309', 'num_input_tokens_seen': 18773037, 'train_runtime': '9497', 'train_tokens_per_second': '1977'} +{'loss': '0.8963', 'grad_norm': '1.422', 'learning_rate': '4.991e-05', 'epoch': '0.2309', 'num_input_tokens_seen': 18775084, 'train_runtime': '9498', 'train_tokens_per_second': '1977'} +{'loss': '0.3728', 'grad_norm': '1.039', 'learning_rate': '4.991e-05', 'epoch': '0.231', 'num_input_tokens_seen': 18777131, 'train_runtime': '9499', 'train_tokens_per_second': '1977'} +{'loss': '0.4987', 'grad_norm': '1.11', 'learning_rate': '4.991e-05', 'epoch': '0.231', 'num_input_tokens_seen': 18779178, 'train_runtime': '9500', 'train_tokens_per_second': '1977'} +{'loss': '0.8438', 'grad_norm': '1.055', 'learning_rate': '4.991e-05', 'epoch': '0.231', 'num_input_tokens_seen': 18781225, 'train_runtime': '9501', 'train_tokens_per_second': '1977'} +{'loss': '0.3175', 'grad_norm': '0.8611', 'learning_rate': '4.991e-05', 'epoch': '0.231', 'num_input_tokens_seen': 18783272, 'train_runtime': '9502', 'train_tokens_per_second': '1977'} +{'loss': '0.4247', 'grad_norm': '0.9175', 'learning_rate': '4.991e-05', 'epoch': '0.2311', 'num_input_tokens_seen': 18785319, 'train_runtime': '9503', 'train_tokens_per_second': '1977'} +{'loss': '0.5731', 'grad_norm': '1.171', 'learning_rate': '4.991e-05', 'epoch': '0.2311', 'num_input_tokens_seen': 18787366, 'train_runtime': '9504', 'train_tokens_per_second': '1977'} +{'loss': '1.948', 'grad_norm': '2.577', 'learning_rate': '4.991e-05', 'epoch': '0.2311', 'num_input_tokens_seen': 18789413, 'train_runtime': '9505', 'train_tokens_per_second': '1977'} +{'loss': '0.6398', 'grad_norm': '1.412', 'learning_rate': '4.991e-05', 'epoch': '0.2311', 'num_input_tokens_seen': 18791460, 'train_runtime': '9506', 'train_tokens_per_second': '1977'} +{'loss': '0.2834', 'grad_norm': '0.9315', 'learning_rate': '4.991e-05', 'epoch': '0.2312', 'num_input_tokens_seen': 18793507, 'train_runtime': '9507', 'train_tokens_per_second': '1977'} +{'loss': '0.505', 'grad_norm': '1.226', 'learning_rate': '4.991e-05', 'epoch': '0.2312', 'num_input_tokens_seen': 18795554, 'train_runtime': '9508', 'train_tokens_per_second': '1977'} +{'loss': '0.685', 'grad_norm': '1.418', 'learning_rate': '4.991e-05', 'epoch': '0.2312', 'num_input_tokens_seen': 18797601, 'train_runtime': '9509', 'train_tokens_per_second': '1977'} +{'loss': '0.7911', 'grad_norm': '1.275', 'learning_rate': '4.991e-05', 'epoch': '0.2312', 'num_input_tokens_seen': 18799648, 'train_runtime': '9510', 'train_tokens_per_second': '1977'} +{'loss': '0.7544', 'grad_norm': '1.413', 'learning_rate': '4.991e-05', 'epoch': '0.2313', 'num_input_tokens_seen': 18801695, 'train_runtime': '9511', 'train_tokens_per_second': '1977'} +{'loss': '0.1771', 'grad_norm': '0.7775', 'learning_rate': '4.991e-05', 'epoch': '0.2313', 'num_input_tokens_seen': 18803742, 'train_runtime': '9512', 'train_tokens_per_second': '1977'} +{'loss': '0.3557', 'grad_norm': '1.02', 'learning_rate': '4.991e-05', 'epoch': '0.2313', 'num_input_tokens_seen': 18805789, 'train_runtime': '9513', 'train_tokens_per_second': '1977'} +{'loss': '0.2826', 'grad_norm': '0.8811', 'learning_rate': '4.991e-05', 'epoch': '0.2313', 'num_input_tokens_seen': 18807836, 'train_runtime': '9514', 'train_tokens_per_second': '1977'} +{'loss': '0.3605', 'grad_norm': '0.8997', 'learning_rate': '4.991e-05', 'epoch': '0.2314', 'num_input_tokens_seen': 18809883, 'train_runtime': '9515', 'train_tokens_per_second': '1977'} +{'loss': '0.238', 'grad_norm': '0.984', 'learning_rate': '4.991e-05', 'epoch': '0.2314', 'num_input_tokens_seen': 18811930, 'train_runtime': '9516', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.101', 'learning_rate': '4.991e-05', 'epoch': '0.2314', 'num_input_tokens_seen': 18813977, 'train_runtime': '9517', 'train_tokens_per_second': '1977'} +{'loss': '1.15', 'grad_norm': '1.967', 'learning_rate': '4.991e-05', 'epoch': '0.2314', 'num_input_tokens_seen': 18816024, 'train_runtime': '9518', 'train_tokens_per_second': '1977'} +{'loss': '0.3527', 'grad_norm': '1.033', 'learning_rate': '4.991e-05', 'epoch': '0.2315', 'num_input_tokens_seen': 18818071, 'train_runtime': '9519', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.451', 'learning_rate': '4.991e-05', 'epoch': '0.2315', 'num_input_tokens_seen': 18820118, 'train_runtime': '9520', 'train_tokens_per_second': '1977'} +{'loss': '0.5644', 'grad_norm': '1.15', 'learning_rate': '4.991e-05', 'epoch': '0.2315', 'num_input_tokens_seen': 18822165, 'train_runtime': '9521', 'train_tokens_per_second': '1977'} +{'loss': '0.7168', 'grad_norm': '1.271', 'learning_rate': '4.991e-05', 'epoch': '0.2315', 'num_input_tokens_seen': 18824212, 'train_runtime': '9522', 'train_tokens_per_second': '1977'} +{'loss': '0.5732', 'grad_norm': '1.321', 'learning_rate': '4.991e-05', 'epoch': '0.2316', 'num_input_tokens_seen': 18826259, 'train_runtime': '9524', 'train_tokens_per_second': '1977'} +{'loss': '0.8737', 'grad_norm': '1.352', 'learning_rate': '4.991e-05', 'epoch': '0.2316', 'num_input_tokens_seen': 18828306, 'train_runtime': '9525', 'train_tokens_per_second': '1977'} +{'loss': '0.3254', 'grad_norm': '0.9134', 'learning_rate': '4.991e-05', 'epoch': '0.2316', 'num_input_tokens_seen': 18830353, 'train_runtime': '9526', 'train_tokens_per_second': '1977'} +{'loss': '1.83', 'grad_norm': '2.479', 'learning_rate': '4.991e-05', 'epoch': '0.2316', 'num_input_tokens_seen': 18832400, 'train_runtime': '9527', 'train_tokens_per_second': '1977'} +{'loss': '0.9436', 'grad_norm': '1.918', 'learning_rate': '4.991e-05', 'epoch': '0.2317', 'num_input_tokens_seen': 18834447, 'train_runtime': '9528', 'train_tokens_per_second': '1977'} +{'loss': '0.4287', 'grad_norm': '1.048', 'learning_rate': '4.991e-05', 'epoch': '0.2317', 'num_input_tokens_seen': 18836494, 'train_runtime': '9529', 'train_tokens_per_second': '1977'} +{'loss': '0.4224', 'grad_norm': '1.075', 'learning_rate': '4.991e-05', 'epoch': '0.2317', 'num_input_tokens_seen': 18838541, 'train_runtime': '9530', 'train_tokens_per_second': '1977'} +{'loss': '0.4526', 'grad_norm': '1.07', 'learning_rate': '4.991e-05', 'epoch': '0.2317', 'num_input_tokens_seen': 18840588, 'train_runtime': '9531', 'train_tokens_per_second': '1977'} +{'loss': '0.5793', 'grad_norm': '1.173', 'learning_rate': '4.991e-05', 'epoch': '0.2318', 'num_input_tokens_seen': 18842635, 'train_runtime': '9532', 'train_tokens_per_second': '1977'} +{'loss': '0.8023', 'grad_norm': '1.873', 'learning_rate': '4.991e-05', 'epoch': '0.2318', 'num_input_tokens_seen': 18844682, 'train_runtime': '9533', 'train_tokens_per_second': '1977'} +{'loss': '1.086', 'grad_norm': '2.073', 'learning_rate': '4.991e-05', 'epoch': '0.2318', 'num_input_tokens_seen': 18846729, 'train_runtime': '9534', 'train_tokens_per_second': '1977'} +{'loss': '0.4401', 'grad_norm': '0.8772', 'learning_rate': '4.991e-05', 'epoch': '0.2318', 'num_input_tokens_seen': 18848776, 'train_runtime': '9535', 'train_tokens_per_second': '1977'} +{'loss': '1.062', 'grad_norm': '1.402', 'learning_rate': '4.991e-05', 'epoch': '0.2319', 'num_input_tokens_seen': 18850823, 'train_runtime': '9536', 'train_tokens_per_second': '1977'} +{'loss': '0.5878', 'grad_norm': '1.203', 'learning_rate': '4.991e-05', 'epoch': '0.2319', 'num_input_tokens_seen': 18852870, 'train_runtime': '9537', 'train_tokens_per_second': '1977'} +{'loss': '0.3533', 'grad_norm': '0.9489', 'learning_rate': '4.991e-05', 'epoch': '0.2319', 'num_input_tokens_seen': 18854917, 'train_runtime': '9538', 'train_tokens_per_second': '1977'} +{'loss': '0.544', 'grad_norm': '1.036', 'learning_rate': '4.991e-05', 'epoch': '0.2319', 'num_input_tokens_seen': 18856964, 'train_runtime': '9539', 'train_tokens_per_second': '1977'} +{'loss': '0.6207', 'grad_norm': '0.9046', 'learning_rate': '4.991e-05', 'epoch': '0.232', 'num_input_tokens_seen': 18859011, 'train_runtime': '9540', 'train_tokens_per_second': '1977'} +{'loss': '0.1963', 'grad_norm': '0.8281', 'learning_rate': '4.991e-05', 'epoch': '0.232', 'num_input_tokens_seen': 18861058, 'train_runtime': '9541', 'train_tokens_per_second': '1977'} +{'loss': '0.806', 'grad_norm': '1.173', 'learning_rate': '4.991e-05', 'epoch': '0.232', 'num_input_tokens_seen': 18863105, 'train_runtime': '9542', 'train_tokens_per_second': '1977'} +{'loss': '0.7958', 'grad_norm': '1.169', 'learning_rate': '4.991e-05', 'epoch': '0.232', 'num_input_tokens_seen': 18865152, 'train_runtime': '9543', 'train_tokens_per_second': '1977'} +{'loss': '0.8358', 'grad_norm': '1.276', 'learning_rate': '4.991e-05', 'epoch': '0.2321', 'num_input_tokens_seen': 18867199, 'train_runtime': '9544', 'train_tokens_per_second': '1977'} +{'loss': '0.6828', 'grad_norm': '1.523', 'learning_rate': '4.991e-05', 'epoch': '0.2321', 'num_input_tokens_seen': 18869246, 'train_runtime': '9545', 'train_tokens_per_second': '1977'} +{'loss': '1.101', 'grad_norm': '2.105', 'learning_rate': '4.991e-05', 'epoch': '0.2321', 'num_input_tokens_seen': 18871293, 'train_runtime': '9546', 'train_tokens_per_second': '1977'} +{'loss': '1.007', 'grad_norm': '1.604', 'learning_rate': '4.991e-05', 'epoch': '0.2321', 'num_input_tokens_seen': 18873340, 'train_runtime': '9547', 'train_tokens_per_second': '1977'} +{'loss': '0.5972', 'grad_norm': '1.449', 'learning_rate': '4.991e-05', 'epoch': '0.2322', 'num_input_tokens_seen': 18875387, 'train_runtime': '9548', 'train_tokens_per_second': '1977'} +{'loss': '0.9032', 'grad_norm': '1.301', 'learning_rate': '4.991e-05', 'epoch': '0.2322', 'num_input_tokens_seen': 18877434, 'train_runtime': '9549', 'train_tokens_per_second': '1977'} +{'loss': '0.3934', 'grad_norm': '1.032', 'learning_rate': '4.991e-05', 'epoch': '0.2322', 'num_input_tokens_seen': 18879481, 'train_runtime': '9550', 'train_tokens_per_second': '1977'} +{'loss': '0.7819', 'grad_norm': '2.039', 'learning_rate': '4.991e-05', 'epoch': '0.2322', 'num_input_tokens_seen': 18881528, 'train_runtime': '9552', 'train_tokens_per_second': '1977'} +{'loss': '0.9314', 'grad_norm': '1.793', 'learning_rate': '4.991e-05', 'epoch': '0.2323', 'num_input_tokens_seen': 18883575, 'train_runtime': '9553', 'train_tokens_per_second': '1977'} +{'loss': '1.087', 'grad_norm': '2.085', 'learning_rate': '4.991e-05', 'epoch': '0.2323', 'num_input_tokens_seen': 18885622, 'train_runtime': '9554', 'train_tokens_per_second': '1977'} +{'loss': '0.5854', 'grad_norm': '1.125', 'learning_rate': '4.991e-05', 'epoch': '0.2323', 'num_input_tokens_seen': 18887669, 'train_runtime': '9555', 'train_tokens_per_second': '1977'} +{'loss': '1.229', 'grad_norm': '2.009', 'learning_rate': '4.991e-05', 'epoch': '0.2323', 'num_input_tokens_seen': 18889716, 'train_runtime': '9556', 'train_tokens_per_second': '1977'} +{'loss': '1.935', 'grad_norm': '2.497', 'learning_rate': '4.991e-05', 'epoch': '0.2324', 'num_input_tokens_seen': 18891763, 'train_runtime': '9557', 'train_tokens_per_second': '1977'} +{'loss': '1.246', 'grad_norm': '1.957', 'learning_rate': '4.991e-05', 'epoch': '0.2324', 'num_input_tokens_seen': 18893810, 'train_runtime': '9558', 'train_tokens_per_second': '1977'} +{'loss': '1.033', 'grad_norm': '1.36', 'learning_rate': '4.991e-05', 'epoch': '0.2324', 'num_input_tokens_seen': 18895857, 'train_runtime': '9559', 'train_tokens_per_second': '1977'} +{'loss': '0.2827', 'grad_norm': '0.9909', 'learning_rate': '4.991e-05', 'epoch': '0.2324', 'num_input_tokens_seen': 18897904, 'train_runtime': '9560', 'train_tokens_per_second': '1977'} +{'loss': '1.4', 'grad_norm': '2.375', 'learning_rate': '4.991e-05', 'epoch': '0.2325', 'num_input_tokens_seen': 18899951, 'train_runtime': '9561', 'train_tokens_per_second': '1977'} +{'loss': '0.8686', 'grad_norm': '1.416', 'learning_rate': '4.991e-05', 'epoch': '0.2325', 'num_input_tokens_seen': 18901998, 'train_runtime': '9562', 'train_tokens_per_second': '1977'} +{'loss': '0.3647', 'grad_norm': '0.7484', 'learning_rate': '4.991e-05', 'epoch': '0.2325', 'num_input_tokens_seen': 18904045, 'train_runtime': '9563', 'train_tokens_per_second': '1977'} +{'loss': '0.7097', 'grad_norm': '1.387', 'learning_rate': '4.991e-05', 'epoch': '0.2325', 'num_input_tokens_seen': 18906092, 'train_runtime': '9564', 'train_tokens_per_second': '1977'} +{'loss': '0.2623', 'grad_norm': '0.8953', 'learning_rate': '4.991e-05', 'epoch': '0.2326', 'num_input_tokens_seen': 18908139, 'train_runtime': '9565', 'train_tokens_per_second': '1977'} +{'loss': '2.238', 'grad_norm': '2.923', 'learning_rate': '4.991e-05', 'epoch': '0.2326', 'num_input_tokens_seen': 18910186, 'train_runtime': '9566', 'train_tokens_per_second': '1977'} +{'loss': '0.3629', 'grad_norm': '0.9827', 'learning_rate': '4.991e-05', 'epoch': '0.2326', 'num_input_tokens_seen': 18912233, 'train_runtime': '9567', 'train_tokens_per_second': '1977'} +{'loss': '0.3501', 'grad_norm': '0.8377', 'learning_rate': '4.991e-05', 'epoch': '0.2326', 'num_input_tokens_seen': 18914280, 'train_runtime': '9568', 'train_tokens_per_second': '1977'} +{'loss': '0.2207', 'grad_norm': '0.881', 'learning_rate': '4.991e-05', 'epoch': '0.2327', 'num_input_tokens_seen': 18916327, 'train_runtime': '9569', 'train_tokens_per_second': '1977'} +{'loss': '0.6347', 'grad_norm': '1.188', 'learning_rate': '4.991e-05', 'epoch': '0.2327', 'num_input_tokens_seen': 18918374, 'train_runtime': '9570', 'train_tokens_per_second': '1977'} +{'loss': '0.7935', 'grad_norm': '1.303', 'learning_rate': '4.991e-05', 'epoch': '0.2327', 'num_input_tokens_seen': 18920421, 'train_runtime': '9571', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '1.816', 'learning_rate': '4.991e-05', 'epoch': '0.2327', 'num_input_tokens_seen': 18922468, 'train_runtime': '9572', 'train_tokens_per_second': '1977'} +{'loss': '1.083', 'grad_norm': '1.565', 'learning_rate': '4.991e-05', 'epoch': '0.2328', 'num_input_tokens_seen': 18924515, 'train_runtime': '9573', 'train_tokens_per_second': '1977'} +{'loss': '0.9282', 'grad_norm': '1.738', 'learning_rate': '4.991e-05', 'epoch': '0.2328', 'num_input_tokens_seen': 18926562, 'train_runtime': '9574', 'train_tokens_per_second': '1977'} +{'loss': '0.2879', 'grad_norm': '1.018', 'learning_rate': '4.991e-05', 'epoch': '0.2328', 'num_input_tokens_seen': 18928609, 'train_runtime': '9575', 'train_tokens_per_second': '1977'} +{'loss': '1.367', 'grad_norm': '2.067', 'learning_rate': '4.991e-05', 'epoch': '0.2328', 'num_input_tokens_seen': 18930656, 'train_runtime': '9576', 'train_tokens_per_second': '1977'} +{'loss': '1.075', 'grad_norm': '2.462', 'learning_rate': '4.991e-05', 'epoch': '0.2329', 'num_input_tokens_seen': 18932703, 'train_runtime': '9577', 'train_tokens_per_second': '1977'} +{'loss': '0.8364', 'grad_norm': '1.427', 'learning_rate': '4.991e-05', 'epoch': '0.2329', 'num_input_tokens_seen': 18934750, 'train_runtime': '9578', 'train_tokens_per_second': '1977'} +{'loss': '0.5186', 'grad_norm': '1.064', 'learning_rate': '4.991e-05', 'epoch': '0.2329', 'num_input_tokens_seen': 18936797, 'train_runtime': '9579', 'train_tokens_per_second': '1977'} +{'loss': '0.4636', 'grad_norm': '1.299', 'learning_rate': '4.991e-05', 'epoch': '0.2329', 'num_input_tokens_seen': 18938844, 'train_runtime': '9581', 'train_tokens_per_second': '1977'} +{'loss': '0.5577', 'grad_norm': '1.05', 'learning_rate': '4.991e-05', 'epoch': '0.233', 'num_input_tokens_seen': 18940891, 'train_runtime': '9582', 'train_tokens_per_second': '1977'} +{'loss': '0.2303', 'grad_norm': '0.8493', 'learning_rate': '4.991e-05', 'epoch': '0.233', 'num_input_tokens_seen': 18942938, 'train_runtime': '9583', 'train_tokens_per_second': '1977'} +{'loss': '0.4829', 'grad_norm': '1.051', 'learning_rate': '4.991e-05', 'epoch': '0.233', 'num_input_tokens_seen': 18944985, 'train_runtime': '9584', 'train_tokens_per_second': '1977'} +{'loss': '0.6423', 'grad_norm': '1.279', 'learning_rate': '4.991e-05', 'epoch': '0.233', 'num_input_tokens_seen': 18947032, 'train_runtime': '9585', 'train_tokens_per_second': '1977'} +{'loss': '0.8043', 'grad_norm': '1.255', 'learning_rate': '4.991e-05', 'epoch': '0.2331', 'num_input_tokens_seen': 18949079, 'train_runtime': '9586', 'train_tokens_per_second': '1977'} +{'loss': '0.8594', 'grad_norm': '1.776', 'learning_rate': '4.991e-05', 'epoch': '0.2331', 'num_input_tokens_seen': 18951126, 'train_runtime': '9587', 'train_tokens_per_second': '1977'} +{'loss': '0.698', 'grad_norm': '1.004', 'learning_rate': '4.991e-05', 'epoch': '0.2331', 'num_input_tokens_seen': 18953173, 'train_runtime': '9588', 'train_tokens_per_second': '1977'} +{'loss': '1.929', 'grad_norm': '2.256', 'learning_rate': '4.991e-05', 'epoch': '0.2331', 'num_input_tokens_seen': 18955220, 'train_runtime': '9589', 'train_tokens_per_second': '1977'} +{'loss': '1.056', 'grad_norm': '1.787', 'learning_rate': '4.991e-05', 'epoch': '0.2332', 'num_input_tokens_seen': 18957267, 'train_runtime': '9590', 'train_tokens_per_second': '1977'} +{'loss': '2.344', 'grad_norm': '2.721', 'learning_rate': '4.991e-05', 'epoch': '0.2332', 'num_input_tokens_seen': 18959314, 'train_runtime': '9591', 'train_tokens_per_second': '1977'} +{'loss': '0.5038', 'grad_norm': '1.031', 'learning_rate': '4.991e-05', 'epoch': '0.2332', 'num_input_tokens_seen': 18961361, 'train_runtime': '9592', 'train_tokens_per_second': '1977'} +{'loss': '0.7816', 'grad_norm': '1.267', 'learning_rate': '4.991e-05', 'epoch': '0.2333', 'num_input_tokens_seen': 18963408, 'train_runtime': '9593', 'train_tokens_per_second': '1977'} +{'loss': '0.7274', 'grad_norm': '1.572', 'learning_rate': '4.991e-05', 'epoch': '0.2333', 'num_input_tokens_seen': 18965455, 'train_runtime': '9594', 'train_tokens_per_second': '1977'} +{'loss': '0.9132', 'grad_norm': '1.412', 'learning_rate': '4.991e-05', 'epoch': '0.2333', 'num_input_tokens_seen': 18967502, 'train_runtime': '9595', 'train_tokens_per_second': '1977'} +{'loss': '0.9462', 'grad_norm': '1.157', 'learning_rate': '4.991e-05', 'epoch': '0.2333', 'num_input_tokens_seen': 18969549, 'train_runtime': '9596', 'train_tokens_per_second': '1977'} +{'loss': '0.3105', 'grad_norm': '1.024', 'learning_rate': '4.991e-05', 'epoch': '0.2334', 'num_input_tokens_seen': 18971596, 'train_runtime': '9597', 'train_tokens_per_second': '1977'} +{'loss': '1.249', 'grad_norm': '1.901', 'learning_rate': '4.991e-05', 'epoch': '0.2334', 'num_input_tokens_seen': 18973643, 'train_runtime': '9598', 'train_tokens_per_second': '1977'} +{'loss': '0.9326', 'grad_norm': '1.423', 'learning_rate': '4.991e-05', 'epoch': '0.2334', 'num_input_tokens_seen': 18975690, 'train_runtime': '9599', 'train_tokens_per_second': '1977'} +{'loss': '0.4586', 'grad_norm': '1.147', 'learning_rate': '4.991e-05', 'epoch': '0.2334', 'num_input_tokens_seen': 18977737, 'train_runtime': '9600', 'train_tokens_per_second': '1977'} +{'loss': '0.4582', 'grad_norm': '1.061', 'learning_rate': '4.991e-05', 'epoch': '0.2335', 'num_input_tokens_seen': 18979784, 'train_runtime': '9601', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '1.183', 'learning_rate': '4.991e-05', 'epoch': '0.2335', 'num_input_tokens_seen': 18981831, 'train_runtime': '9602', 'train_tokens_per_second': '1977'} +{'loss': '1.595', 'grad_norm': '1.999', 'learning_rate': '4.991e-05', 'epoch': '0.2335', 'num_input_tokens_seen': 18983878, 'train_runtime': '9603', 'train_tokens_per_second': '1977'} +{'loss': '0.3502', 'grad_norm': '1.068', 'learning_rate': '4.991e-05', 'epoch': '0.2335', 'num_input_tokens_seen': 18985925, 'train_runtime': '9604', 'train_tokens_per_second': '1977'} +{'loss': '1.173', 'grad_norm': '1.983', 'learning_rate': '4.991e-05', 'epoch': '0.2336', 'num_input_tokens_seen': 18987972, 'train_runtime': '9605', 'train_tokens_per_second': '1977'} +{'loss': '0.3244', 'grad_norm': '0.8156', 'learning_rate': '4.991e-05', 'epoch': '0.2336', 'num_input_tokens_seen': 18990019, 'train_runtime': '9606', 'train_tokens_per_second': '1977'} +{'loss': '0.8507', 'grad_norm': '1.325', 'learning_rate': '4.991e-05', 'epoch': '0.2336', 'num_input_tokens_seen': 18992066, 'train_runtime': '9607', 'train_tokens_per_second': '1977'} +{'loss': '0.3886', 'grad_norm': '0.8798', 'learning_rate': '4.991e-05', 'epoch': '0.2336', 'num_input_tokens_seen': 18994113, 'train_runtime': '9608', 'train_tokens_per_second': '1977'} +{'loss': '1.172', 'grad_norm': '2.036', 'learning_rate': '4.991e-05', 'epoch': '0.2337', 'num_input_tokens_seen': 18996160, 'train_runtime': '9610', 'train_tokens_per_second': '1977'} +{'loss': '0.4553', 'grad_norm': '1.528', 'learning_rate': '4.991e-05', 'epoch': '0.2337', 'num_input_tokens_seen': 18998207, 'train_runtime': '9611', 'train_tokens_per_second': '1977'} +{'loss': '0.5435', 'grad_norm': '1.046', 'learning_rate': '4.991e-05', 'epoch': '0.2337', 'num_input_tokens_seen': 19000254, 'train_runtime': '9612', 'train_tokens_per_second': '1977'} +{'loss': '0.9962', 'grad_norm': '1.619', 'learning_rate': '4.991e-05', 'epoch': '0.2337', 'num_input_tokens_seen': 19002301, 'train_runtime': '9613', 'train_tokens_per_second': '1977'} +{'loss': '0.6866', 'grad_norm': '1.268', 'learning_rate': '4.991e-05', 'epoch': '0.2338', 'num_input_tokens_seen': 19004348, 'train_runtime': '9614', 'train_tokens_per_second': '1977'} +{'loss': '0.2349', 'grad_norm': '0.7995', 'learning_rate': '4.991e-05', 'epoch': '0.2338', 'num_input_tokens_seen': 19006395, 'train_runtime': '9615', 'train_tokens_per_second': '1977'} +{'loss': '0.891', 'grad_norm': '1.497', 'learning_rate': '4.991e-05', 'epoch': '0.2338', 'num_input_tokens_seen': 19008442, 'train_runtime': '9616', 'train_tokens_per_second': '1977'} +{'loss': '0.2459', 'grad_norm': '0.9073', 'learning_rate': '4.991e-05', 'epoch': '0.2338', 'num_input_tokens_seen': 19010489, 'train_runtime': '9617', 'train_tokens_per_second': '1977'} +{'loss': '0.556', 'grad_norm': '1.088', 'learning_rate': '4.991e-05', 'epoch': '0.2339', 'num_input_tokens_seen': 19012536, 'train_runtime': '9618', 'train_tokens_per_second': '1977'} +{'loss': '0.6526', 'grad_norm': '1.54', 'learning_rate': '4.991e-05', 'epoch': '0.2339', 'num_input_tokens_seen': 19014583, 'train_runtime': '9619', 'train_tokens_per_second': '1977'} +{'loss': '0.3624', 'grad_norm': '0.9636', 'learning_rate': '4.991e-05', 'epoch': '0.2339', 'num_input_tokens_seen': 19016630, 'train_runtime': '9620', 'train_tokens_per_second': '1977'} +{'loss': '0.4371', 'grad_norm': '0.8802', 'learning_rate': '4.991e-05', 'epoch': '0.2339', 'num_input_tokens_seen': 19018677, 'train_runtime': '9621', 'train_tokens_per_second': '1977'} +{'loss': '0.6046', 'grad_norm': '0.8874', 'learning_rate': '4.991e-05', 'epoch': '0.234', 'num_input_tokens_seen': 19020724, 'train_runtime': '9622', 'train_tokens_per_second': '1977'} +{'loss': '0.3567', 'grad_norm': '0.9546', 'learning_rate': '4.991e-05', 'epoch': '0.234', 'num_input_tokens_seen': 19022771, 'train_runtime': '9623', 'train_tokens_per_second': '1977'} +{'loss': '0.6585', 'grad_norm': '1.36', 'learning_rate': '4.991e-05', 'epoch': '0.234', 'num_input_tokens_seen': 19024818, 'train_runtime': '9624', 'train_tokens_per_second': '1977'} +{'loss': '0.7362', 'grad_norm': '1.53', 'learning_rate': '4.991e-05', 'epoch': '0.234', 'num_input_tokens_seen': 19026865, 'train_runtime': '9625', 'train_tokens_per_second': '1977'} +{'loss': '1.559', 'grad_norm': '2.244', 'learning_rate': '4.991e-05', 'epoch': '0.2341', 'num_input_tokens_seen': 19028912, 'train_runtime': '9626', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.619', 'learning_rate': '4.991e-05', 'epoch': '0.2341', 'num_input_tokens_seen': 19030959, 'train_runtime': '9627', 'train_tokens_per_second': '1977'} +{'loss': '0.4689', 'grad_norm': '1.178', 'learning_rate': '4.991e-05', 'epoch': '0.2341', 'num_input_tokens_seen': 19033006, 'train_runtime': '9628', 'train_tokens_per_second': '1977'} +{'loss': '0.5838', 'grad_norm': '1.411', 'learning_rate': '4.991e-05', 'epoch': '0.2341', 'num_input_tokens_seen': 19035053, 'train_runtime': '9629', 'train_tokens_per_second': '1977'} +{'loss': '1.464', 'grad_norm': '2.786', 'learning_rate': '4.991e-05', 'epoch': '0.2342', 'num_input_tokens_seen': 19037100, 'train_runtime': '9630', 'train_tokens_per_second': '1977'} +{'loss': '0.7782', 'grad_norm': '1.364', 'learning_rate': '4.991e-05', 'epoch': '0.2342', 'num_input_tokens_seen': 19039147, 'train_runtime': '9631', 'train_tokens_per_second': '1977'} +{'loss': '0.552', 'grad_norm': '1.541', 'learning_rate': '4.991e-05', 'epoch': '0.2342', 'num_input_tokens_seen': 19041194, 'train_runtime': '9632', 'train_tokens_per_second': '1977'} +{'loss': '0.5422', 'grad_norm': '1.036', 'learning_rate': '4.991e-05', 'epoch': '0.2342', 'num_input_tokens_seen': 19043241, 'train_runtime': '9633', 'train_tokens_per_second': '1977'} +{'loss': '1.299', 'grad_norm': '2.107', 'learning_rate': '4.991e-05', 'epoch': '0.2343', 'num_input_tokens_seen': 19045288, 'train_runtime': '9634', 'train_tokens_per_second': '1977'} +{'loss': '1.018', 'grad_norm': '2.052', 'learning_rate': '4.991e-05', 'epoch': '0.2343', 'num_input_tokens_seen': 19047335, 'train_runtime': '9635', 'train_tokens_per_second': '1977'} +{'loss': '0.3187', 'grad_norm': '0.9173', 'learning_rate': '4.991e-05', 'epoch': '0.2343', 'num_input_tokens_seen': 19049382, 'train_runtime': '9636', 'train_tokens_per_second': '1977'} +{'loss': '0.9809', 'grad_norm': '1.463', 'learning_rate': '4.991e-05', 'epoch': '0.2343', 'num_input_tokens_seen': 19051429, 'train_runtime': '9637', 'train_tokens_per_second': '1977'} +{'loss': '0.3832', 'grad_norm': '0.9301', 'learning_rate': '4.991e-05', 'epoch': '0.2344', 'num_input_tokens_seen': 19053476, 'train_runtime': '9639', 'train_tokens_per_second': '1977'} +{'loss': '0.3134', 'grad_norm': '1.137', 'learning_rate': '4.991e-05', 'epoch': '0.2344', 'num_input_tokens_seen': 19055523, 'train_runtime': '9640', 'train_tokens_per_second': '1977'} +{'loss': '0.8067', 'grad_norm': '1.57', 'learning_rate': '4.991e-05', 'epoch': '0.2344', 'num_input_tokens_seen': 19057570, 'train_runtime': '9641', 'train_tokens_per_second': '1977'} +{'loss': '0.3693', 'grad_norm': '0.9459', 'learning_rate': '4.991e-05', 'epoch': '0.2344', 'num_input_tokens_seen': 19059617, 'train_runtime': '9642', 'train_tokens_per_second': '1977'} +{'loss': '0.7427', 'grad_norm': '1.158', 'learning_rate': '4.991e-05', 'epoch': '0.2345', 'num_input_tokens_seen': 19061664, 'train_runtime': '9643', 'train_tokens_per_second': '1977'} +{'loss': '0.408', 'grad_norm': '0.9214', 'learning_rate': '4.991e-05', 'epoch': '0.2345', 'num_input_tokens_seen': 19063711, 'train_runtime': '9644', 'train_tokens_per_second': '1977'} +{'loss': '0.8011', 'grad_norm': '1.557', 'learning_rate': '4.991e-05', 'epoch': '0.2345', 'num_input_tokens_seen': 19065758, 'train_runtime': '9645', 'train_tokens_per_second': '1977'} +{'loss': '0.3388', 'grad_norm': '1.118', 'learning_rate': '4.991e-05', 'epoch': '0.2345', 'num_input_tokens_seen': 19067805, 'train_runtime': '9646', 'train_tokens_per_second': '1977'} +{'loss': '0.4238', 'grad_norm': '0.8961', 'learning_rate': '4.991e-05', 'epoch': '0.2346', 'num_input_tokens_seen': 19069852, 'train_runtime': '9647', 'train_tokens_per_second': '1977'} +{'loss': '1.563', 'grad_norm': '2.246', 'learning_rate': '4.991e-05', 'epoch': '0.2346', 'num_input_tokens_seen': 19071899, 'train_runtime': '9648', 'train_tokens_per_second': '1977'} +{'loss': '0.4393', 'grad_norm': '0.9701', 'learning_rate': '4.991e-05', 'epoch': '0.2346', 'num_input_tokens_seen': 19073946, 'train_runtime': '9649', 'train_tokens_per_second': '1977'} +{'loss': '0.5545', 'grad_norm': '1.267', 'learning_rate': '4.991e-05', 'epoch': '0.2346', 'num_input_tokens_seen': 19075993, 'train_runtime': '9650', 'train_tokens_per_second': '1977'} +{'loss': '0.5505', 'grad_norm': '1.4', 'learning_rate': '4.991e-05', 'epoch': '0.2347', 'num_input_tokens_seen': 19078040, 'train_runtime': '9651', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '1.187', 'learning_rate': '4.991e-05', 'epoch': '0.2347', 'num_input_tokens_seen': 19080087, 'train_runtime': '9652', 'train_tokens_per_second': '1977'} +{'loss': '0.5142', 'grad_norm': '1.328', 'learning_rate': '4.991e-05', 'epoch': '0.2347', 'num_input_tokens_seen': 19082134, 'train_runtime': '9653', 'train_tokens_per_second': '1977'} +{'loss': '0.6035', 'grad_norm': '1.238', 'learning_rate': '4.991e-05', 'epoch': '0.2347', 'num_input_tokens_seen': 19084181, 'train_runtime': '9654', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '2.096', 'learning_rate': '4.991e-05', 'epoch': '0.2348', 'num_input_tokens_seen': 19086228, 'train_runtime': '9655', 'train_tokens_per_second': '1977'} +{'loss': '1.527', 'grad_norm': '2.645', 'learning_rate': '4.991e-05', 'epoch': '0.2348', 'num_input_tokens_seen': 19088275, 'train_runtime': '9656', 'train_tokens_per_second': '1977'} +{'loss': '0.2597', 'grad_norm': '0.7233', 'learning_rate': '4.991e-05', 'epoch': '0.2348', 'num_input_tokens_seen': 19090322, 'train_runtime': '9657', 'train_tokens_per_second': '1977'} +{'loss': '0.7466', 'grad_norm': '1.191', 'learning_rate': '4.991e-05', 'epoch': '0.2348', 'num_input_tokens_seen': 19092369, 'train_runtime': '9658', 'train_tokens_per_second': '1977'} +{'loss': '0.2658', 'grad_norm': '0.9676', 'learning_rate': '4.991e-05', 'epoch': '0.2349', 'num_input_tokens_seen': 19094416, 'train_runtime': '9659', 'train_tokens_per_second': '1977'} +{'loss': '0.7009', 'grad_norm': '0.9941', 'learning_rate': '4.991e-05', 'epoch': '0.2349', 'num_input_tokens_seen': 19096463, 'train_runtime': '9660', 'train_tokens_per_second': '1977'} +{'loss': '0.5267', 'grad_norm': '1.081', 'learning_rate': '4.991e-05', 'epoch': '0.2349', 'num_input_tokens_seen': 19098510, 'train_runtime': '9661', 'train_tokens_per_second': '1977'} +{'loss': '2.101', 'grad_norm': '2.53', 'learning_rate': '4.991e-05', 'epoch': '0.2349', 'num_input_tokens_seen': 19100557, 'train_runtime': '9662', 'train_tokens_per_second': '1977'} +{'loss': '1.406', 'grad_norm': '2.575', 'learning_rate': '4.991e-05', 'epoch': '0.235', 'num_input_tokens_seen': 19102604, 'train_runtime': '9663', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '2.119', 'learning_rate': '4.991e-05', 'epoch': '0.235', 'num_input_tokens_seen': 19104651, 'train_runtime': '9664', 'train_tokens_per_second': '1977'} +{'loss': '0.2477', 'grad_norm': '0.8216', 'learning_rate': '4.991e-05', 'epoch': '0.235', 'num_input_tokens_seen': 19106698, 'train_runtime': '9665', 'train_tokens_per_second': '1977'} +{'loss': '1.383', 'grad_norm': '2.548', 'learning_rate': '4.991e-05', 'epoch': '0.235', 'num_input_tokens_seen': 19108745, 'train_runtime': '9667', 'train_tokens_per_second': '1977'} +{'loss': '1.082', 'grad_norm': '1.64', 'learning_rate': '4.991e-05', 'epoch': '0.2351', 'num_input_tokens_seen': 19110792, 'train_runtime': '9668', 'train_tokens_per_second': '1977'} +{'loss': '1.453', 'grad_norm': '1.882', 'learning_rate': '4.991e-05', 'epoch': '0.2351', 'num_input_tokens_seen': 19112839, 'train_runtime': '9669', 'train_tokens_per_second': '1977'} +{'loss': '1.824', 'grad_norm': '2.572', 'learning_rate': '4.991e-05', 'epoch': '0.2351', 'num_input_tokens_seen': 19114886, 'train_runtime': '9670', 'train_tokens_per_second': '1977'} +{'loss': '1.219', 'grad_norm': '1.958', 'learning_rate': '4.991e-05', 'epoch': '0.2351', 'num_input_tokens_seen': 19116933, 'train_runtime': '9671', 'train_tokens_per_second': '1977'} +{'loss': '0.4672', 'grad_norm': '0.9478', 'learning_rate': '4.991e-05', 'epoch': '0.2352', 'num_input_tokens_seen': 19118980, 'train_runtime': '9672', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '1.781', 'learning_rate': '4.991e-05', 'epoch': '0.2352', 'num_input_tokens_seen': 19121027, 'train_runtime': '9673', 'train_tokens_per_second': '1977'} +{'loss': '0.7212', 'grad_norm': '1.276', 'learning_rate': '4.991e-05', 'epoch': '0.2352', 'num_input_tokens_seen': 19123074, 'train_runtime': '9674', 'train_tokens_per_second': '1977'} +{'loss': '0.3524', 'grad_norm': '0.8338', 'learning_rate': '4.991e-05', 'epoch': '0.2352', 'num_input_tokens_seen': 19125121, 'train_runtime': '9675', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.593', 'learning_rate': '4.991e-05', 'epoch': '0.2353', 'num_input_tokens_seen': 19127168, 'train_runtime': '9676', 'train_tokens_per_second': '1977'} +{'loss': '0.8542', 'grad_norm': '1.498', 'learning_rate': '4.991e-05', 'epoch': '0.2353', 'num_input_tokens_seen': 19129215, 'train_runtime': '9677', 'train_tokens_per_second': '1977'} +{'loss': '0.3827', 'grad_norm': '0.8575', 'learning_rate': '4.991e-05', 'epoch': '0.2353', 'num_input_tokens_seen': 19131262, 'train_runtime': '9678', 'train_tokens_per_second': '1977'} +{'loss': '2.157', 'grad_norm': '2.957', 'learning_rate': '4.991e-05', 'epoch': '0.2353', 'num_input_tokens_seen': 19133309, 'train_runtime': '9679', 'train_tokens_per_second': '1977'} +{'loss': '1.45', 'grad_norm': '2.504', 'learning_rate': '4.991e-05', 'epoch': '0.2354', 'num_input_tokens_seen': 19135356, 'train_runtime': '9680', 'train_tokens_per_second': '1977'} +{'loss': '0.3523', 'grad_norm': '1.037', 'learning_rate': '4.991e-05', 'epoch': '0.2354', 'num_input_tokens_seen': 19137403, 'train_runtime': '9681', 'train_tokens_per_second': '1977'} +{'loss': '0.9592', 'grad_norm': '1.348', 'learning_rate': '4.991e-05', 'epoch': '0.2354', 'num_input_tokens_seen': 19139450, 'train_runtime': '9682', 'train_tokens_per_second': '1977'} +{'loss': '0.582', 'grad_norm': '1.378', 'learning_rate': '4.991e-05', 'epoch': '0.2354', 'num_input_tokens_seen': 19141497, 'train_runtime': '9683', 'train_tokens_per_second': '1977'} +{'loss': '0.8368', 'grad_norm': '1.222', 'learning_rate': '4.991e-05', 'epoch': '0.2355', 'num_input_tokens_seen': 19143544, 'train_runtime': '9684', 'train_tokens_per_second': '1977'} +{'loss': '0.3841', 'grad_norm': '1.017', 'learning_rate': '4.991e-05', 'epoch': '0.2355', 'num_input_tokens_seen': 19145591, 'train_runtime': '9685', 'train_tokens_per_second': '1977'} +{'loss': '0.7094', 'grad_norm': '1.298', 'learning_rate': '4.991e-05', 'epoch': '0.2355', 'num_input_tokens_seen': 19147638, 'train_runtime': '9686', 'train_tokens_per_second': '1977'} +{'loss': '0.3095', 'grad_norm': '0.8441', 'learning_rate': '4.991e-05', 'epoch': '0.2355', 'num_input_tokens_seen': 19149685, 'train_runtime': '9687', 'train_tokens_per_second': '1977'} +{'loss': '0.3748', 'grad_norm': '1.078', 'learning_rate': '4.991e-05', 'epoch': '0.2356', 'num_input_tokens_seen': 19151732, 'train_runtime': '9688', 'train_tokens_per_second': '1977'} +{'loss': '0.63', 'grad_norm': '0.9064', 'learning_rate': '4.991e-05', 'epoch': '0.2356', 'num_input_tokens_seen': 19153779, 'train_runtime': '9689', 'train_tokens_per_second': '1977'} +{'loss': '0.809', 'grad_norm': '1.273', 'learning_rate': '4.991e-05', 'epoch': '0.2356', 'num_input_tokens_seen': 19155826, 'train_runtime': '9690', 'train_tokens_per_second': '1977'} +{'loss': '0.3068', 'grad_norm': '0.9486', 'learning_rate': '4.991e-05', 'epoch': '0.2356', 'num_input_tokens_seen': 19157873, 'train_runtime': '9691', 'train_tokens_per_second': '1977'} +{'loss': '0.9988', 'grad_norm': '2.299', 'learning_rate': '4.991e-05', 'epoch': '0.2357', 'num_input_tokens_seen': 19159920, 'train_runtime': '9692', 'train_tokens_per_second': '1977'} +{'loss': '0.3277', 'grad_norm': '0.9455', 'learning_rate': '4.991e-05', 'epoch': '0.2357', 'num_input_tokens_seen': 19161967, 'train_runtime': '9693', 'train_tokens_per_second': '1977'} +{'loss': '0.2768', 'grad_norm': '1.001', 'learning_rate': '4.991e-05', 'epoch': '0.2357', 'num_input_tokens_seen': 19164014, 'train_runtime': '9695', 'train_tokens_per_second': '1977'} +{'loss': '0.7278', 'grad_norm': '1.614', 'learning_rate': '4.991e-05', 'epoch': '0.2357', 'num_input_tokens_seen': 19166061, 'train_runtime': '9696', 'train_tokens_per_second': '1977'} +{'loss': '0.9672', 'grad_norm': '1.545', 'learning_rate': '4.991e-05', 'epoch': '0.2358', 'num_input_tokens_seen': 19168108, 'train_runtime': '9697', 'train_tokens_per_second': '1977'} +{'loss': '0.6979', 'grad_norm': '1.306', 'learning_rate': '4.991e-05', 'epoch': '0.2358', 'num_input_tokens_seen': 19170155, 'train_runtime': '9698', 'train_tokens_per_second': '1977'} +{'loss': '0.4004', 'grad_norm': '0.9234', 'learning_rate': '4.991e-05', 'epoch': '0.2358', 'num_input_tokens_seen': 19172202, 'train_runtime': '9699', 'train_tokens_per_second': '1977'} +{'loss': '0.5276', 'grad_norm': '1.316', 'learning_rate': '4.991e-05', 'epoch': '0.2358', 'num_input_tokens_seen': 19174249, 'train_runtime': '9700', 'train_tokens_per_second': '1977'} +{'loss': '1.316', 'grad_norm': '6.672', 'learning_rate': '4.991e-05', 'epoch': '0.2359', 'num_input_tokens_seen': 19176296, 'train_runtime': '9701', 'train_tokens_per_second': '1977'} +{'loss': '0.3359', 'grad_norm': '0.7349', 'learning_rate': '4.991e-05', 'epoch': '0.2359', 'num_input_tokens_seen': 19178343, 'train_runtime': '9702', 'train_tokens_per_second': '1977'} +{'loss': '1.321', 'grad_norm': '1.649', 'learning_rate': '4.991e-05', 'epoch': '0.2359', 'num_input_tokens_seen': 19180390, 'train_runtime': '9703', 'train_tokens_per_second': '1977'} +{'loss': '0.2757', 'grad_norm': '0.7677', 'learning_rate': '4.991e-05', 'epoch': '0.2359', 'num_input_tokens_seen': 19182437, 'train_runtime': '9704', 'train_tokens_per_second': '1977'} +{'loss': '0.923', 'grad_norm': '1.737', 'learning_rate': '4.991e-05', 'epoch': '0.236', 'num_input_tokens_seen': 19184484, 'train_runtime': '9705', 'train_tokens_per_second': '1977'} +{'loss': '1.029', 'grad_norm': '1.73', 'learning_rate': '4.991e-05', 'epoch': '0.236', 'num_input_tokens_seen': 19186531, 'train_runtime': '9706', 'train_tokens_per_second': '1977'} +{'loss': '0.3816', 'grad_norm': '0.918', 'learning_rate': '4.991e-05', 'epoch': '0.236', 'num_input_tokens_seen': 19188578, 'train_runtime': '9707', 'train_tokens_per_second': '1977'} +{'loss': '1.207', 'grad_norm': '1.788', 'learning_rate': '4.991e-05', 'epoch': '0.236', 'num_input_tokens_seen': 19190625, 'train_runtime': '9708', 'train_tokens_per_second': '1977'} +{'loss': '0.597', 'grad_norm': '0.9907', 'learning_rate': '4.99e-05', 'epoch': '0.2361', 'num_input_tokens_seen': 19192672, 'train_runtime': '9709', 'train_tokens_per_second': '1977'} +{'loss': '1.257', 'grad_norm': '2.074', 'learning_rate': '4.99e-05', 'epoch': '0.2361', 'num_input_tokens_seen': 19194719, 'train_runtime': '9710', 'train_tokens_per_second': '1977'} +{'loss': '0.5584', 'grad_norm': '1.2', 'learning_rate': '4.99e-05', 'epoch': '0.2361', 'num_input_tokens_seen': 19196766, 'train_runtime': '9711', 'train_tokens_per_second': '1977'} +{'loss': '1.044', 'grad_norm': '1.613', 'learning_rate': '4.99e-05', 'epoch': '0.2361', 'num_input_tokens_seen': 19198813, 'train_runtime': '9712', 'train_tokens_per_second': '1977'} +{'loss': '0.2185', 'grad_norm': '0.8604', 'learning_rate': '4.99e-05', 'epoch': '0.2362', 'num_input_tokens_seen': 19200860, 'train_runtime': '9713', 'train_tokens_per_second': '1977'} +{'loss': '0.5653', 'grad_norm': '1.292', 'learning_rate': '4.99e-05', 'epoch': '0.2362', 'num_input_tokens_seen': 19202907, 'train_runtime': '9714', 'train_tokens_per_second': '1977'} +{'loss': '0.5567', 'grad_norm': '1.111', 'learning_rate': '4.99e-05', 'epoch': '0.2362', 'num_input_tokens_seen': 19204954, 'train_runtime': '9715', 'train_tokens_per_second': '1977'} +{'loss': '0.7352', 'grad_norm': '1.443', 'learning_rate': '4.99e-05', 'epoch': '0.2362', 'num_input_tokens_seen': 19207001, 'train_runtime': '9716', 'train_tokens_per_second': '1977'} +{'loss': '0.8303', 'grad_norm': '1.263', 'learning_rate': '4.99e-05', 'epoch': '0.2363', 'num_input_tokens_seen': 19209048, 'train_runtime': '9717', 'train_tokens_per_second': '1977'} +{'loss': '0.3234', 'grad_norm': '1.036', 'learning_rate': '4.99e-05', 'epoch': '0.2363', 'num_input_tokens_seen': 19211095, 'train_runtime': '9718', 'train_tokens_per_second': '1977'} +{'loss': '0.2749', 'grad_norm': '0.9382', 'learning_rate': '4.99e-05', 'epoch': '0.2363', 'num_input_tokens_seen': 19213142, 'train_runtime': '9719', 'train_tokens_per_second': '1977'} +{'loss': '1.237', 'grad_norm': '2.3', 'learning_rate': '4.99e-05', 'epoch': '0.2363', 'num_input_tokens_seen': 19215189, 'train_runtime': '9720', 'train_tokens_per_second': '1977'} +{'loss': '0.9756', 'grad_norm': '1.654', 'learning_rate': '4.99e-05', 'epoch': '0.2364', 'num_input_tokens_seen': 19217236, 'train_runtime': '9721', 'train_tokens_per_second': '1977'} +{'loss': '1.561', 'grad_norm': '3.291', 'learning_rate': '4.99e-05', 'epoch': '0.2364', 'num_input_tokens_seen': 19219283, 'train_runtime': '9722', 'train_tokens_per_second': '1977'} +{'loss': '0.4639', 'grad_norm': '1.087', 'learning_rate': '4.99e-05', 'epoch': '0.2364', 'num_input_tokens_seen': 19221330, 'train_runtime': '9723', 'train_tokens_per_second': '1977'} +{'loss': '0.4996', 'grad_norm': '1.325', 'learning_rate': '4.99e-05', 'epoch': '0.2364', 'num_input_tokens_seen': 19223377, 'train_runtime': '9725', 'train_tokens_per_second': '1977'} +{'loss': '0.3668', 'grad_norm': '0.9988', 'learning_rate': '4.99e-05', 'epoch': '0.2365', 'num_input_tokens_seen': 19225424, 'train_runtime': '9726', 'train_tokens_per_second': '1977'} +{'loss': '0.3362', 'grad_norm': '0.7703', 'learning_rate': '4.99e-05', 'epoch': '0.2365', 'num_input_tokens_seen': 19227471, 'train_runtime': '9727', 'train_tokens_per_second': '1977'} +{'loss': '0.4996', 'grad_norm': '1.186', 'learning_rate': '4.99e-05', 'epoch': '0.2365', 'num_input_tokens_seen': 19229518, 'train_runtime': '9728', 'train_tokens_per_second': '1977'} +{'loss': '0.6536', 'grad_norm': '1.154', 'learning_rate': '4.99e-05', 'epoch': '0.2365', 'num_input_tokens_seen': 19231565, 'train_runtime': '9729', 'train_tokens_per_second': '1977'} +{'loss': '0.3038', 'grad_norm': '0.8514', 'learning_rate': '4.99e-05', 'epoch': '0.2366', 'num_input_tokens_seen': 19233612, 'train_runtime': '9730', 'train_tokens_per_second': '1977'} +{'loss': '0.4711', 'grad_norm': '1.141', 'learning_rate': '4.99e-05', 'epoch': '0.2366', 'num_input_tokens_seen': 19235659, 'train_runtime': '9731', 'train_tokens_per_second': '1977'} +{'loss': '1.112', 'grad_norm': '1.556', 'learning_rate': '4.99e-05', 'epoch': '0.2366', 'num_input_tokens_seen': 19237706, 'train_runtime': '9732', 'train_tokens_per_second': '1977'} +{'loss': '0.3819', 'grad_norm': '0.9949', 'learning_rate': '4.99e-05', 'epoch': '0.2366', 'num_input_tokens_seen': 19239753, 'train_runtime': '9733', 'train_tokens_per_second': '1977'} +{'loss': '0.7998', 'grad_norm': '1.207', 'learning_rate': '4.99e-05', 'epoch': '0.2367', 'num_input_tokens_seen': 19241800, 'train_runtime': '9734', 'train_tokens_per_second': '1977'} +{'loss': '1.544', 'grad_norm': '2.53', 'learning_rate': '4.99e-05', 'epoch': '0.2367', 'num_input_tokens_seen': 19243847, 'train_runtime': '9735', 'train_tokens_per_second': '1977'} +{'loss': '0.3604', 'grad_norm': '0.8835', 'learning_rate': '4.99e-05', 'epoch': '0.2367', 'num_input_tokens_seen': 19245894, 'train_runtime': '9736', 'train_tokens_per_second': '1977'} +{'loss': '0.5424', 'grad_norm': '1.433', 'learning_rate': '4.99e-05', 'epoch': '0.2368', 'num_input_tokens_seen': 19247941, 'train_runtime': '9737', 'train_tokens_per_second': '1977'} +{'loss': '1.275', 'grad_norm': '2.302', 'learning_rate': '4.99e-05', 'epoch': '0.2368', 'num_input_tokens_seen': 19249988, 'train_runtime': '9738', 'train_tokens_per_second': '1977'} +{'loss': '0.8257', 'grad_norm': '1.202', 'learning_rate': '4.99e-05', 'epoch': '0.2368', 'num_input_tokens_seen': 19252035, 'train_runtime': '9739', 'train_tokens_per_second': '1977'} +{'loss': '0.3523', 'grad_norm': '0.8384', 'learning_rate': '4.99e-05', 'epoch': '0.2368', 'num_input_tokens_seen': 19254082, 'train_runtime': '9740', 'train_tokens_per_second': '1977'} +{'loss': '1.649', 'grad_norm': '2.565', 'learning_rate': '4.99e-05', 'epoch': '0.2369', 'num_input_tokens_seen': 19256129, 'train_runtime': '9741', 'train_tokens_per_second': '1977'} +{'loss': '1.654', 'grad_norm': '2.555', 'learning_rate': '4.99e-05', 'epoch': '0.2369', 'num_input_tokens_seen': 19258176, 'train_runtime': '9742', 'train_tokens_per_second': '1977'} +{'loss': '1.183', 'grad_norm': '2.109', 'learning_rate': '4.99e-05', 'epoch': '0.2369', 'num_input_tokens_seen': 19260223, 'train_runtime': '9743', 'train_tokens_per_second': '1977'} +{'loss': '0.6638', 'grad_norm': '1.404', 'learning_rate': '4.99e-05', 'epoch': '0.2369', 'num_input_tokens_seen': 19262270, 'train_runtime': '9744', 'train_tokens_per_second': '1977'} +{'loss': '0.7325', 'grad_norm': '1.193', 'learning_rate': '4.99e-05', 'epoch': '0.237', 'num_input_tokens_seen': 19264317, 'train_runtime': '9745', 'train_tokens_per_second': '1977'} +{'loss': '1.016', 'grad_norm': '1.711', 'learning_rate': '4.99e-05', 'epoch': '0.237', 'num_input_tokens_seen': 19266364, 'train_runtime': '9746', 'train_tokens_per_second': '1977'} +{'loss': '0.1698', 'grad_norm': '0.7493', 'learning_rate': '4.99e-05', 'epoch': '0.237', 'num_input_tokens_seen': 19268411, 'train_runtime': '9747', 'train_tokens_per_second': '1977'} +{'loss': '0.6852', 'grad_norm': '1.612', 'learning_rate': '4.99e-05', 'epoch': '0.237', 'num_input_tokens_seen': 19270458, 'train_runtime': '9748', 'train_tokens_per_second': '1977'} +{'loss': '0.7834', 'grad_norm': '1.348', 'learning_rate': '4.99e-05', 'epoch': '0.2371', 'num_input_tokens_seen': 19272505, 'train_runtime': '9749', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.845', 'learning_rate': '4.99e-05', 'epoch': '0.2371', 'num_input_tokens_seen': 19274552, 'train_runtime': '9750', 'train_tokens_per_second': '1977'} +{'loss': '0.5533', 'grad_norm': '1.158', 'learning_rate': '4.99e-05', 'epoch': '0.2371', 'num_input_tokens_seen': 19276599, 'train_runtime': '9751', 'train_tokens_per_second': '1977'} +{'loss': '0.5961', 'grad_norm': '1.042', 'learning_rate': '4.99e-05', 'epoch': '0.2371', 'num_input_tokens_seen': 19278646, 'train_runtime': '9752', 'train_tokens_per_second': '1977'} +{'loss': '0.979', 'grad_norm': '1.391', 'learning_rate': '4.99e-05', 'epoch': '0.2372', 'num_input_tokens_seen': 19280693, 'train_runtime': '9753', 'train_tokens_per_second': '1977'} +{'loss': '0.5964', 'grad_norm': '1.003', 'learning_rate': '4.99e-05', 'epoch': '0.2372', 'num_input_tokens_seen': 19282740, 'train_runtime': '9755', 'train_tokens_per_second': '1977'} +{'loss': '0.5', 'grad_norm': '1.505', 'learning_rate': '4.99e-05', 'epoch': '0.2372', 'num_input_tokens_seen': 19284787, 'train_runtime': '9756', 'train_tokens_per_second': '1977'} +{'loss': '1.615', 'grad_norm': '3.243', 'learning_rate': '4.99e-05', 'epoch': '0.2372', 'num_input_tokens_seen': 19286834, 'train_runtime': '9757', 'train_tokens_per_second': '1977'} +{'loss': '0.5415', 'grad_norm': '0.9468', 'learning_rate': '4.99e-05', 'epoch': '0.2373', 'num_input_tokens_seen': 19288881, 'train_runtime': '9758', 'train_tokens_per_second': '1977'} +{'loss': '0.4985', 'grad_norm': '1.485', 'learning_rate': '4.99e-05', 'epoch': '0.2373', 'num_input_tokens_seen': 19290928, 'train_runtime': '9759', 'train_tokens_per_second': '1977'} +{'loss': '0.3558', 'grad_norm': '1.156', 'learning_rate': '4.99e-05', 'epoch': '0.2373', 'num_input_tokens_seen': 19292975, 'train_runtime': '9760', 'train_tokens_per_second': '1977'} +{'loss': '0.3172', 'grad_norm': '1.016', 'learning_rate': '4.99e-05', 'epoch': '0.2373', 'num_input_tokens_seen': 19295022, 'train_runtime': '9761', 'train_tokens_per_second': '1977'} +{'loss': '1.656', 'grad_norm': '2.444', 'learning_rate': '4.99e-05', 'epoch': '0.2374', 'num_input_tokens_seen': 19297069, 'train_runtime': '9762', 'train_tokens_per_second': '1977'} +{'loss': '0.2645', 'grad_norm': '0.9818', 'learning_rate': '4.99e-05', 'epoch': '0.2374', 'num_input_tokens_seen': 19299116, 'train_runtime': '9763', 'train_tokens_per_second': '1977'} +{'loss': '0.6', 'grad_norm': '0.9121', 'learning_rate': '4.99e-05', 'epoch': '0.2374', 'num_input_tokens_seen': 19301163, 'train_runtime': '9764', 'train_tokens_per_second': '1977'} +{'loss': '0.341', 'grad_norm': '0.769', 'learning_rate': '4.99e-05', 'epoch': '0.2374', 'num_input_tokens_seen': 19303210, 'train_runtime': '9765', 'train_tokens_per_second': '1977'} +{'loss': '0.8846', 'grad_norm': '1.676', 'learning_rate': '4.99e-05', 'epoch': '0.2375', 'num_input_tokens_seen': 19305257, 'train_runtime': '9766', 'train_tokens_per_second': '1977'} +{'loss': '0.7629', 'grad_norm': '1.529', 'learning_rate': '4.99e-05', 'epoch': '0.2375', 'num_input_tokens_seen': 19307304, 'train_runtime': '9767', 'train_tokens_per_second': '1977'} +{'loss': '0.7244', 'grad_norm': '1.09', 'learning_rate': '4.99e-05', 'epoch': '0.2375', 'num_input_tokens_seen': 19309351, 'train_runtime': '9768', 'train_tokens_per_second': '1977'} +{'loss': '0.7636', 'grad_norm': '1.222', 'learning_rate': '4.99e-05', 'epoch': '0.2375', 'num_input_tokens_seen': 19311398, 'train_runtime': '9769', 'train_tokens_per_second': '1977'} +{'loss': '0.196', 'grad_norm': '0.8802', 'learning_rate': '4.99e-05', 'epoch': '0.2376', 'num_input_tokens_seen': 19313445, 'train_runtime': '9770', 'train_tokens_per_second': '1977'} +{'loss': '0.36', 'grad_norm': '0.8309', 'learning_rate': '4.99e-05', 'epoch': '0.2376', 'num_input_tokens_seen': 19315492, 'train_runtime': '9771', 'train_tokens_per_second': '1977'} +{'loss': '0.6005', 'grad_norm': '1.237', 'learning_rate': '4.99e-05', 'epoch': '0.2376', 'num_input_tokens_seen': 19317539, 'train_runtime': '9772', 'train_tokens_per_second': '1977'} +{'loss': '0.4527', 'grad_norm': '0.8072', 'learning_rate': '4.99e-05', 'epoch': '0.2376', 'num_input_tokens_seen': 19319586, 'train_runtime': '9773', 'train_tokens_per_second': '1977'} +{'loss': '0.5793', 'grad_norm': '1.282', 'learning_rate': '4.99e-05', 'epoch': '0.2377', 'num_input_tokens_seen': 19321633, 'train_runtime': '9774', 'train_tokens_per_second': '1977'} +{'loss': '0.7055', 'grad_norm': '1.424', 'learning_rate': '4.99e-05', 'epoch': '0.2377', 'num_input_tokens_seen': 19323680, 'train_runtime': '9775', 'train_tokens_per_second': '1977'} +{'loss': '0.9017', 'grad_norm': '1.23', 'learning_rate': '4.99e-05', 'epoch': '0.2377', 'num_input_tokens_seen': 19325727, 'train_runtime': '9776', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '2.057', 'learning_rate': '4.99e-05', 'epoch': '0.2377', 'num_input_tokens_seen': 19327774, 'train_runtime': '9777', 'train_tokens_per_second': '1977'} +{'loss': '1.312', 'grad_norm': '1.838', 'learning_rate': '4.99e-05', 'epoch': '0.2378', 'num_input_tokens_seen': 19329821, 'train_runtime': '9778', 'train_tokens_per_second': '1977'} +{'loss': '1.231', 'grad_norm': '2.152', 'learning_rate': '4.99e-05', 'epoch': '0.2378', 'num_input_tokens_seen': 19331868, 'train_runtime': '9779', 'train_tokens_per_second': '1977'} +{'loss': '1.319', 'grad_norm': '2.027', 'learning_rate': '4.99e-05', 'epoch': '0.2378', 'num_input_tokens_seen': 19333915, 'train_runtime': '9780', 'train_tokens_per_second': '1977'} +{'loss': '0.6095', 'grad_norm': '1.305', 'learning_rate': '4.99e-05', 'epoch': '0.2378', 'num_input_tokens_seen': 19335962, 'train_runtime': '9781', 'train_tokens_per_second': '1977'} +{'loss': '1.327', 'grad_norm': '2.062', 'learning_rate': '4.99e-05', 'epoch': '0.2379', 'num_input_tokens_seen': 19338009, 'train_runtime': '9782', 'train_tokens_per_second': '1977'} +{'loss': '0.3964', 'grad_norm': '1.144', 'learning_rate': '4.99e-05', 'epoch': '0.2379', 'num_input_tokens_seen': 19340056, 'train_runtime': '9783', 'train_tokens_per_second': '1977'} +{'loss': '0.2635', 'grad_norm': '0.9323', 'learning_rate': '4.99e-05', 'epoch': '0.2379', 'num_input_tokens_seen': 19342103, 'train_runtime': '9785', 'train_tokens_per_second': '1977'} +{'loss': '0.8076', 'grad_norm': '1.156', 'learning_rate': '4.99e-05', 'epoch': '0.2379', 'num_input_tokens_seen': 19344150, 'train_runtime': '9786', 'train_tokens_per_second': '1977'} +{'loss': '0.9117', 'grad_norm': '1.854', 'learning_rate': '4.99e-05', 'epoch': '0.238', 'num_input_tokens_seen': 19346197, 'train_runtime': '9787', 'train_tokens_per_second': '1977'} +{'loss': '0.6447', 'grad_norm': '1.506', 'learning_rate': '4.99e-05', 'epoch': '0.238', 'num_input_tokens_seen': 19348244, 'train_runtime': '9788', 'train_tokens_per_second': '1977'} +{'loss': '0.2909', 'grad_norm': '0.7843', 'learning_rate': '4.99e-05', 'epoch': '0.238', 'num_input_tokens_seen': 19350291, 'train_runtime': '9789', 'train_tokens_per_second': '1977'} +{'loss': '0.9381', 'grad_norm': '1.431', 'learning_rate': '4.99e-05', 'epoch': '0.238', 'num_input_tokens_seen': 19352338, 'train_runtime': '9790', 'train_tokens_per_second': '1977'} +{'loss': '0.4143', 'grad_norm': '1.028', 'learning_rate': '4.99e-05', 'epoch': '0.2381', 'num_input_tokens_seen': 19354385, 'train_runtime': '9791', 'train_tokens_per_second': '1977'} +{'loss': '0.2803', 'grad_norm': '1.049', 'learning_rate': '4.99e-05', 'epoch': '0.2381', 'num_input_tokens_seen': 19356432, 'train_runtime': '9792', 'train_tokens_per_second': '1977'} +{'loss': '0.5327', 'grad_norm': '1.007', 'learning_rate': '4.99e-05', 'epoch': '0.2381', 'num_input_tokens_seen': 19358479, 'train_runtime': '9793', 'train_tokens_per_second': '1977'} +{'loss': '0.6458', 'grad_norm': '1.416', 'learning_rate': '4.99e-05', 'epoch': '0.2381', 'num_input_tokens_seen': 19360526, 'train_runtime': '9794', 'train_tokens_per_second': '1977'} +{'loss': '0.314', 'grad_norm': '1.03', 'learning_rate': '4.99e-05', 'epoch': '0.2382', 'num_input_tokens_seen': 19362573, 'train_runtime': '9795', 'train_tokens_per_second': '1977'} +{'loss': '1.414', 'grad_norm': '2.192', 'learning_rate': '4.99e-05', 'epoch': '0.2382', 'num_input_tokens_seen': 19364620, 'train_runtime': '9796', 'train_tokens_per_second': '1977'} +{'loss': '0.3601', 'grad_norm': '1.011', 'learning_rate': '4.99e-05', 'epoch': '0.2382', 'num_input_tokens_seen': 19366667, 'train_runtime': '9797', 'train_tokens_per_second': '1977'} +{'loss': '0.5176', 'grad_norm': '1.015', 'learning_rate': '4.99e-05', 'epoch': '0.2382', 'num_input_tokens_seen': 19368714, 'train_runtime': '9798', 'train_tokens_per_second': '1977'} +{'loss': '0.3526', 'grad_norm': '1.043', 'learning_rate': '4.99e-05', 'epoch': '0.2383', 'num_input_tokens_seen': 19370761, 'train_runtime': '9799', 'train_tokens_per_second': '1977'} +{'loss': '0.8317', 'grad_norm': '1.471', 'learning_rate': '4.99e-05', 'epoch': '0.2383', 'num_input_tokens_seen': 19372808, 'train_runtime': '9800', 'train_tokens_per_second': '1977'} +{'loss': '0.1808', 'grad_norm': '0.9376', 'learning_rate': '4.99e-05', 'epoch': '0.2383', 'num_input_tokens_seen': 19374855, 'train_runtime': '9801', 'train_tokens_per_second': '1977'} +{'loss': '0.3573', 'grad_norm': '0.8984', 'learning_rate': '4.99e-05', 'epoch': '0.2383', 'num_input_tokens_seen': 19376902, 'train_runtime': '9802', 'train_tokens_per_second': '1977'} +{'loss': '0.4282', 'grad_norm': '1.231', 'learning_rate': '4.99e-05', 'epoch': '0.2384', 'num_input_tokens_seen': 19378949, 'train_runtime': '9803', 'train_tokens_per_second': '1977'} +{'loss': '1.457', 'grad_norm': '1.889', 'learning_rate': '4.99e-05', 'epoch': '0.2384', 'num_input_tokens_seen': 19380996, 'train_runtime': '9804', 'train_tokens_per_second': '1977'} +{'loss': '0.5147', 'grad_norm': '1.175', 'learning_rate': '4.99e-05', 'epoch': '0.2384', 'num_input_tokens_seen': 19383043, 'train_runtime': '9805', 'train_tokens_per_second': '1977'} +{'loss': '0.657', 'grad_norm': '0.925', 'learning_rate': '4.99e-05', 'epoch': '0.2384', 'num_input_tokens_seen': 19385090, 'train_runtime': '9806', 'train_tokens_per_second': '1977'} +{'loss': '0.9457', 'grad_norm': '1.884', 'learning_rate': '4.99e-05', 'epoch': '0.2385', 'num_input_tokens_seen': 19387137, 'train_runtime': '9807', 'train_tokens_per_second': '1977'} +{'loss': '1.151', 'grad_norm': '1.041', 'learning_rate': '4.99e-05', 'epoch': '0.2385', 'num_input_tokens_seen': 19389184, 'train_runtime': '9808', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.424', 'learning_rate': '4.99e-05', 'epoch': '0.2385', 'num_input_tokens_seen': 19391231, 'train_runtime': '9809', 'train_tokens_per_second': '1977'} +{'loss': '0.5248', 'grad_norm': '0.9247', 'learning_rate': '4.99e-05', 'epoch': '0.2385', 'num_input_tokens_seen': 19393278, 'train_runtime': '9810', 'train_tokens_per_second': '1977'} +{'loss': '0.1846', 'grad_norm': '0.8499', 'learning_rate': '4.99e-05', 'epoch': '0.2386', 'num_input_tokens_seen': 19395325, 'train_runtime': '9811', 'train_tokens_per_second': '1977'} +{'loss': '0.3696', 'grad_norm': '1.029', 'learning_rate': '4.99e-05', 'epoch': '0.2386', 'num_input_tokens_seen': 19397372, 'train_runtime': '9812', 'train_tokens_per_second': '1977'} +{'loss': '0.4783', 'grad_norm': '1.029', 'learning_rate': '4.99e-05', 'epoch': '0.2386', 'num_input_tokens_seen': 19399419, 'train_runtime': '9813', 'train_tokens_per_second': '1977'} +{'loss': '1.604', 'grad_norm': '2.793', 'learning_rate': '4.99e-05', 'epoch': '0.2386', 'num_input_tokens_seen': 19401466, 'train_runtime': '9815', 'train_tokens_per_second': '1977'} +{'loss': '1.675', 'grad_norm': '2.109', 'learning_rate': '4.99e-05', 'epoch': '0.2387', 'num_input_tokens_seen': 19403513, 'train_runtime': '9816', 'train_tokens_per_second': '1977'} +{'loss': '0.3058', 'grad_norm': '0.8291', 'learning_rate': '4.99e-05', 'epoch': '0.2387', 'num_input_tokens_seen': 19405560, 'train_runtime': '9817', 'train_tokens_per_second': '1977'} +{'loss': '0.5753', 'grad_norm': '1.011', 'learning_rate': '4.99e-05', 'epoch': '0.2387', 'num_input_tokens_seen': 19407607, 'train_runtime': '9818', 'train_tokens_per_second': '1977'} +{'loss': '0.4315', 'grad_norm': '1.212', 'learning_rate': '4.99e-05', 'epoch': '0.2387', 'num_input_tokens_seen': 19409654, 'train_runtime': '9819', 'train_tokens_per_second': '1977'} +{'loss': '0.4875', 'grad_norm': '1.306', 'learning_rate': '4.99e-05', 'epoch': '0.2388', 'num_input_tokens_seen': 19411701, 'train_runtime': '9820', 'train_tokens_per_second': '1977'} +{'loss': '0.453', 'grad_norm': '1.013', 'learning_rate': '4.99e-05', 'epoch': '0.2388', 'num_input_tokens_seen': 19413748, 'train_runtime': '9821', 'train_tokens_per_second': '1977'} +{'loss': '0.801', 'grad_norm': '1.13', 'learning_rate': '4.99e-05', 'epoch': '0.2388', 'num_input_tokens_seen': 19415795, 'train_runtime': '9822', 'train_tokens_per_second': '1977'} +{'loss': '1.234', 'grad_norm': '2.366', 'learning_rate': '4.99e-05', 'epoch': '0.2388', 'num_input_tokens_seen': 19417842, 'train_runtime': '9823', 'train_tokens_per_second': '1977'} +{'loss': '0.7236', 'grad_norm': '1.242', 'learning_rate': '4.99e-05', 'epoch': '0.2389', 'num_input_tokens_seen': 19419889, 'train_runtime': '9824', 'train_tokens_per_second': '1977'} +{'loss': '0.9839', 'grad_norm': '1.813', 'learning_rate': '4.99e-05', 'epoch': '0.2389', 'num_input_tokens_seen': 19421936, 'train_runtime': '9825', 'train_tokens_per_second': '1977'} +{'loss': '0.9408', 'grad_norm': '1.496', 'learning_rate': '4.99e-05', 'epoch': '0.2389', 'num_input_tokens_seen': 19423983, 'train_runtime': '9826', 'train_tokens_per_second': '1977'} +{'loss': '0.4526', 'grad_norm': '0.9427', 'learning_rate': '4.99e-05', 'epoch': '0.2389', 'num_input_tokens_seen': 19426030, 'train_runtime': '9827', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '1.777', 'learning_rate': '4.99e-05', 'epoch': '0.239', 'num_input_tokens_seen': 19428077, 'train_runtime': '9828', 'train_tokens_per_second': '1977'} +{'loss': '0.492', 'grad_norm': '0.8947', 'learning_rate': '4.99e-05', 'epoch': '0.239', 'num_input_tokens_seen': 19430124, 'train_runtime': '9829', 'train_tokens_per_second': '1977'} +{'loss': '0.2661', 'grad_norm': '0.9087', 'learning_rate': '4.99e-05', 'epoch': '0.239', 'num_input_tokens_seen': 19432171, 'train_runtime': '9830', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '1.1', 'learning_rate': '4.99e-05', 'epoch': '0.239', 'num_input_tokens_seen': 19434218, 'train_runtime': '9831', 'train_tokens_per_second': '1977'} +{'loss': '0.3158', 'grad_norm': '1.084', 'learning_rate': '4.99e-05', 'epoch': '0.2391', 'num_input_tokens_seen': 19436265, 'train_runtime': '9832', 'train_tokens_per_second': '1977'} +{'loss': '0.7562', 'grad_norm': '1.634', 'learning_rate': '4.99e-05', 'epoch': '0.2391', 'num_input_tokens_seen': 19438312, 'train_runtime': '9833', 'train_tokens_per_second': '1977'} +{'loss': '0.7312', 'grad_norm': '1.219', 'learning_rate': '4.99e-05', 'epoch': '0.2391', 'num_input_tokens_seen': 19440359, 'train_runtime': '9834', 'train_tokens_per_second': '1977'} +{'loss': '1.047', 'grad_norm': '1.764', 'learning_rate': '4.99e-05', 'epoch': '0.2391', 'num_input_tokens_seen': 19442406, 'train_runtime': '9835', 'train_tokens_per_second': '1977'} +{'loss': '0.4121', 'grad_norm': '0.959', 'learning_rate': '4.99e-05', 'epoch': '0.2392', 'num_input_tokens_seen': 19444453, 'train_runtime': '9836', 'train_tokens_per_second': '1977'} +{'loss': '0.976', 'grad_norm': '1.55', 'learning_rate': '4.99e-05', 'epoch': '0.2392', 'num_input_tokens_seen': 19446500, 'train_runtime': '9837', 'train_tokens_per_second': '1977'} +{'loss': '0.384', 'grad_norm': '0.8676', 'learning_rate': '4.99e-05', 'epoch': '0.2392', 'num_input_tokens_seen': 19448547, 'train_runtime': '9838', 'train_tokens_per_second': '1977'} +{'loss': '0.5705', 'grad_norm': '1.301', 'learning_rate': '4.99e-05', 'epoch': '0.2392', 'num_input_tokens_seen': 19450594, 'train_runtime': '9839', 'train_tokens_per_second': '1977'} +{'loss': '0.6202', 'grad_norm': '1.28', 'learning_rate': '4.99e-05', 'epoch': '0.2393', 'num_input_tokens_seen': 19452641, 'train_runtime': '9840', 'train_tokens_per_second': '1977'} +{'loss': '0.6203', 'grad_norm': '1.197', 'learning_rate': '4.99e-05', 'epoch': '0.2393', 'num_input_tokens_seen': 19454688, 'train_runtime': '9841', 'train_tokens_per_second': '1977'} +{'loss': '0.3187', 'grad_norm': '1.037', 'learning_rate': '4.99e-05', 'epoch': '0.2393', 'num_input_tokens_seen': 19456735, 'train_runtime': '9842', 'train_tokens_per_second': '1977'} +{'loss': '1.746', 'grad_norm': '2.699', 'learning_rate': '4.99e-05', 'epoch': '0.2393', 'num_input_tokens_seen': 19458782, 'train_runtime': '9843', 'train_tokens_per_second': '1977'} +{'loss': '0.2685', 'grad_norm': '0.9169', 'learning_rate': '4.99e-05', 'epoch': '0.2394', 'num_input_tokens_seen': 19460829, 'train_runtime': '9844', 'train_tokens_per_second': '1977'} +{'loss': '1.474', 'grad_norm': '1.88', 'learning_rate': '4.99e-05', 'epoch': '0.2394', 'num_input_tokens_seen': 19462876, 'train_runtime': '9846', 'train_tokens_per_second': '1977'} +{'loss': '0.7372', 'grad_norm': '1.439', 'learning_rate': '4.99e-05', 'epoch': '0.2394', 'num_input_tokens_seen': 19464923, 'train_runtime': '9847', 'train_tokens_per_second': '1977'} +{'loss': '0.3222', 'grad_norm': '0.8573', 'learning_rate': '4.99e-05', 'epoch': '0.2394', 'num_input_tokens_seen': 19466970, 'train_runtime': '9848', 'train_tokens_per_second': '1977'} +{'loss': '0.8541', 'grad_norm': '1.758', 'learning_rate': '4.99e-05', 'epoch': '0.2395', 'num_input_tokens_seen': 19469017, 'train_runtime': '9849', 'train_tokens_per_second': '1977'} +{'loss': '0.7957', 'grad_norm': '1.228', 'learning_rate': '4.99e-05', 'epoch': '0.2395', 'num_input_tokens_seen': 19471064, 'train_runtime': '9850', 'train_tokens_per_second': '1977'} +{'loss': '0.8215', 'grad_norm': '1.274', 'learning_rate': '4.99e-05', 'epoch': '0.2395', 'num_input_tokens_seen': 19473111, 'train_runtime': '9851', 'train_tokens_per_second': '1977'} +{'loss': '0.5372', 'grad_norm': '1.274', 'learning_rate': '4.99e-05', 'epoch': '0.2395', 'num_input_tokens_seen': 19475158, 'train_runtime': '9852', 'train_tokens_per_second': '1977'} +{'loss': '1.544', 'grad_norm': '2.532', 'learning_rate': '4.99e-05', 'epoch': '0.2396', 'num_input_tokens_seen': 19477205, 'train_runtime': '9853', 'train_tokens_per_second': '1977'} +{'loss': '0.5469', 'grad_norm': '1.109', 'learning_rate': '4.99e-05', 'epoch': '0.2396', 'num_input_tokens_seen': 19479252, 'train_runtime': '9854', 'train_tokens_per_second': '1977'} +{'loss': '0.6108', 'grad_norm': '1.31', 'learning_rate': '4.99e-05', 'epoch': '0.2396', 'num_input_tokens_seen': 19481299, 'train_runtime': '9855', 'train_tokens_per_second': '1977'} +{'loss': '1.033', 'grad_norm': '1.58', 'learning_rate': '4.99e-05', 'epoch': '0.2396', 'num_input_tokens_seen': 19483346, 'train_runtime': '9856', 'train_tokens_per_second': '1977'} +{'loss': '0.5567', 'grad_norm': '1.305', 'learning_rate': '4.99e-05', 'epoch': '0.2397', 'num_input_tokens_seen': 19485393, 'train_runtime': '9857', 'train_tokens_per_second': '1977'} +{'loss': '0.6518', 'grad_norm': '1.186', 'learning_rate': '4.99e-05', 'epoch': '0.2397', 'num_input_tokens_seen': 19487440, 'train_runtime': '9858', 'train_tokens_per_second': '1977'} +{'loss': '0.3741', 'grad_norm': '0.8901', 'learning_rate': '4.99e-05', 'epoch': '0.2397', 'num_input_tokens_seen': 19489487, 'train_runtime': '9859', 'train_tokens_per_second': '1977'} +{'loss': '0.8395', 'grad_norm': '1.631', 'learning_rate': '4.99e-05', 'epoch': '0.2397', 'num_input_tokens_seen': 19491534, 'train_runtime': '9860', 'train_tokens_per_second': '1977'} +{'loss': '0.7888', 'grad_norm': '1.372', 'learning_rate': '4.99e-05', 'epoch': '0.2398', 'num_input_tokens_seen': 19493581, 'train_runtime': '9861', 'train_tokens_per_second': '1977'} +{'loss': '1.782', 'grad_norm': '2.526', 'learning_rate': '4.99e-05', 'epoch': '0.2398', 'num_input_tokens_seen': 19495628, 'train_runtime': '9862', 'train_tokens_per_second': '1977'} +{'loss': '0.1902', 'grad_norm': '0.8008', 'learning_rate': '4.99e-05', 'epoch': '0.2398', 'num_input_tokens_seen': 19497675, 'train_runtime': '9863', 'train_tokens_per_second': '1977'} +{'loss': '0.3307', 'grad_norm': '0.7948', 'learning_rate': '4.99e-05', 'epoch': '0.2398', 'num_input_tokens_seen': 19499722, 'train_runtime': '9864', 'train_tokens_per_second': '1977'} +{'loss': '0.5177', 'grad_norm': '1.052', 'learning_rate': '4.99e-05', 'epoch': '0.2399', 'num_input_tokens_seen': 19501769, 'train_runtime': '9865', 'train_tokens_per_second': '1977'} +{'loss': '0.6065', 'grad_norm': '1.387', 'learning_rate': '4.99e-05', 'epoch': '0.2399', 'num_input_tokens_seen': 19503816, 'train_runtime': '9866', 'train_tokens_per_second': '1977'} +{'loss': '0.4066', 'grad_norm': '1.053', 'learning_rate': '4.99e-05', 'epoch': '0.2399', 'num_input_tokens_seen': 19505863, 'train_runtime': '9867', 'train_tokens_per_second': '1977'} +{'loss': '0.7658', 'grad_norm': '1.231', 'learning_rate': '4.99e-05', 'epoch': '0.2399', 'num_input_tokens_seen': 19507910, 'train_runtime': '9868', 'train_tokens_per_second': '1977'} +{'loss': '0.4403', 'grad_norm': '1.096', 'learning_rate': '4.99e-05', 'epoch': '0.24', 'num_input_tokens_seen': 19509957, 'train_runtime': '9869', 'train_tokens_per_second': '1977'} +{'loss': '0.4395', 'grad_norm': '1.271', 'learning_rate': '4.99e-05', 'epoch': '0.24', 'num_input_tokens_seen': 19512004, 'train_runtime': '9870', 'train_tokens_per_second': '1977'} +{'loss': '1.585', 'grad_norm': '2.421', 'learning_rate': '4.99e-05', 'epoch': '0.24', 'num_input_tokens_seen': 19514051, 'train_runtime': '9871', 'train_tokens_per_second': '1977'} +{'loss': '1.571', 'grad_norm': '1.878', 'learning_rate': '4.99e-05', 'epoch': '0.24', 'num_input_tokens_seen': 19516098, 'train_runtime': '9872', 'train_tokens_per_second': '1977'} +{'loss': '0.9503', 'grad_norm': '1.407', 'learning_rate': '4.99e-05', 'epoch': '0.2401', 'num_input_tokens_seen': 19518145, 'train_runtime': '9873', 'train_tokens_per_second': '1977'} +{'loss': '1.702', 'grad_norm': '2.619', 'learning_rate': '4.99e-05', 'epoch': '0.2401', 'num_input_tokens_seen': 19520192, 'train_runtime': '9875', 'train_tokens_per_second': '1977'} +{'loss': '1.265', 'grad_norm': '1.415', 'learning_rate': '4.99e-05', 'epoch': '0.2401', 'num_input_tokens_seen': 19522239, 'train_runtime': '9876', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '2.339', 'learning_rate': '4.99e-05', 'epoch': '0.2401', 'num_input_tokens_seen': 19524286, 'train_runtime': '9877', 'train_tokens_per_second': '1977'} +{'loss': '0.6745', 'grad_norm': '1.034', 'learning_rate': '4.99e-05', 'epoch': '0.2402', 'num_input_tokens_seen': 19526333, 'train_runtime': '9878', 'train_tokens_per_second': '1977'} +{'loss': '0.7474', 'grad_norm': '1.201', 'learning_rate': '4.99e-05', 'epoch': '0.2402', 'num_input_tokens_seen': 19528380, 'train_runtime': '9879', 'train_tokens_per_second': '1977'} +{'loss': '0.7714', 'grad_norm': '1.188', 'learning_rate': '4.99e-05', 'epoch': '0.2402', 'num_input_tokens_seen': 19530427, 'train_runtime': '9880', 'train_tokens_per_second': '1977'} +{'loss': '1.195', 'grad_norm': '2.038', 'learning_rate': '4.99e-05', 'epoch': '0.2402', 'num_input_tokens_seen': 19532474, 'train_runtime': '9881', 'train_tokens_per_second': '1977'} +{'loss': '0.3018', 'grad_norm': '0.9171', 'learning_rate': '4.99e-05', 'epoch': '0.2403', 'num_input_tokens_seen': 19534521, 'train_runtime': '9882', 'train_tokens_per_second': '1977'} +{'loss': '0.7233', 'grad_norm': '0.8912', 'learning_rate': '4.99e-05', 'epoch': '0.2403', 'num_input_tokens_seen': 19536568, 'train_runtime': '9883', 'train_tokens_per_second': '1977'} +{'loss': '0.7091', 'grad_norm': '1.244', 'learning_rate': '4.99e-05', 'epoch': '0.2403', 'num_input_tokens_seen': 19538615, 'train_runtime': '9884', 'train_tokens_per_second': '1977'} +{'loss': '0.2892', 'grad_norm': '0.8972', 'learning_rate': '4.99e-05', 'epoch': '0.2404', 'num_input_tokens_seen': 19540662, 'train_runtime': '9885', 'train_tokens_per_second': '1977'} +{'loss': '1.734', 'grad_norm': '2.554', 'learning_rate': '4.99e-05', 'epoch': '0.2404', 'num_input_tokens_seen': 19542709, 'train_runtime': '9886', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '1.527', 'learning_rate': '4.99e-05', 'epoch': '0.2404', 'num_input_tokens_seen': 19544756, 'train_runtime': '9887', 'train_tokens_per_second': '1977'} +{'loss': '0.9853', 'grad_norm': '1.545', 'learning_rate': '4.99e-05', 'epoch': '0.2404', 'num_input_tokens_seen': 19546803, 'train_runtime': '9888', 'train_tokens_per_second': '1977'} +{'loss': '1.989', 'grad_norm': '2.49', 'learning_rate': '4.99e-05', 'epoch': '0.2405', 'num_input_tokens_seen': 19548850, 'train_runtime': '9889', 'train_tokens_per_second': '1977'} +{'loss': '0.639', 'grad_norm': '1.339', 'learning_rate': '4.99e-05', 'epoch': '0.2405', 'num_input_tokens_seen': 19550897, 'train_runtime': '9890', 'train_tokens_per_second': '1977'} +{'loss': '0.7652', 'grad_norm': '1.526', 'learning_rate': '4.99e-05', 'epoch': '0.2405', 'num_input_tokens_seen': 19552944, 'train_runtime': '9891', 'train_tokens_per_second': '1977'} +{'loss': '0.9368', 'grad_norm': '2.13', 'learning_rate': '4.99e-05', 'epoch': '0.2405', 'num_input_tokens_seen': 19554991, 'train_runtime': '9892', 'train_tokens_per_second': '1977'} +{'loss': '0.4322', 'grad_norm': '1.182', 'learning_rate': '4.99e-05', 'epoch': '0.2406', 'num_input_tokens_seen': 19557038, 'train_runtime': '9893', 'train_tokens_per_second': '1977'} +{'loss': '2.068', 'grad_norm': '2.278', 'learning_rate': '4.99e-05', 'epoch': '0.2406', 'num_input_tokens_seen': 19559085, 'train_runtime': '9894', 'train_tokens_per_second': '1977'} +{'loss': '0.2654', 'grad_norm': '0.8157', 'learning_rate': '4.99e-05', 'epoch': '0.2406', 'num_input_tokens_seen': 19561132, 'train_runtime': '9895', 'train_tokens_per_second': '1977'} +{'loss': '0.9633', 'grad_norm': '1.589', 'learning_rate': '4.99e-05', 'epoch': '0.2406', 'num_input_tokens_seen': 19563179, 'train_runtime': '9896', 'train_tokens_per_second': '1977'} +{'loss': '0.2911', 'grad_norm': '0.9351', 'learning_rate': '4.99e-05', 'epoch': '0.2407', 'num_input_tokens_seen': 19565226, 'train_runtime': '9897', 'train_tokens_per_second': '1977'} +{'loss': '1.221', 'grad_norm': '2.123', 'learning_rate': '4.99e-05', 'epoch': '0.2407', 'num_input_tokens_seen': 19567273, 'train_runtime': '9898', 'train_tokens_per_second': '1977'} +{'loss': '0.6625', 'grad_norm': '1.404', 'learning_rate': '4.99e-05', 'epoch': '0.2407', 'num_input_tokens_seen': 19569320, 'train_runtime': '9899', 'train_tokens_per_second': '1977'} +{'loss': '0.6067', 'grad_norm': '1.053', 'learning_rate': '4.99e-05', 'epoch': '0.2407', 'num_input_tokens_seen': 19571367, 'train_runtime': '9900', 'train_tokens_per_second': '1977'} +{'loss': '1.874', 'grad_norm': '2.591', 'learning_rate': '4.99e-05', 'epoch': '0.2408', 'num_input_tokens_seen': 19573414, 'train_runtime': '9901', 'train_tokens_per_second': '1977'} +{'loss': '0.3739', 'grad_norm': '1.075', 'learning_rate': '4.99e-05', 'epoch': '0.2408', 'num_input_tokens_seen': 19575461, 'train_runtime': '9902', 'train_tokens_per_second': '1977'} +{'loss': '1.244', 'grad_norm': '1.673', 'learning_rate': '4.99e-05', 'epoch': '0.2408', 'num_input_tokens_seen': 19577508, 'train_runtime': '9904', 'train_tokens_per_second': '1977'} +{'loss': '0.8647', 'grad_norm': '1.407', 'learning_rate': '4.99e-05', 'epoch': '0.2408', 'num_input_tokens_seen': 19579555, 'train_runtime': '9905', 'train_tokens_per_second': '1977'} +{'loss': '0.7377', 'grad_norm': '1.281', 'learning_rate': '4.99e-05', 'epoch': '0.2409', 'num_input_tokens_seen': 19581602, 'train_runtime': '9906', 'train_tokens_per_second': '1977'} +{'loss': '0.3789', 'grad_norm': '0.9597', 'learning_rate': '4.99e-05', 'epoch': '0.2409', 'num_input_tokens_seen': 19583649, 'train_runtime': '9907', 'train_tokens_per_second': '1977'} +{'loss': '2.474', 'grad_norm': '2.697', 'learning_rate': '4.99e-05', 'epoch': '0.2409', 'num_input_tokens_seen': 19585696, 'train_runtime': '9908', 'train_tokens_per_second': '1977'} +{'loss': '0.5121', 'grad_norm': '0.6919', 'learning_rate': '4.99e-05', 'epoch': '0.2409', 'num_input_tokens_seen': 19587743, 'train_runtime': '9909', 'train_tokens_per_second': '1977'} +{'loss': '0.8879', 'grad_norm': '1.352', 'learning_rate': '4.99e-05', 'epoch': '0.241', 'num_input_tokens_seen': 19589790, 'train_runtime': '9910', 'train_tokens_per_second': '1977'} +{'loss': '0.6932', 'grad_norm': '1.054', 'learning_rate': '4.99e-05', 'epoch': '0.241', 'num_input_tokens_seen': 19591837, 'train_runtime': '9911', 'train_tokens_per_second': '1977'} +{'loss': '1.291', 'grad_norm': '2.09', 'learning_rate': '4.99e-05', 'epoch': '0.241', 'num_input_tokens_seen': 19593884, 'train_runtime': '9912', 'train_tokens_per_second': '1977'} +{'loss': '0.269', 'grad_norm': '0.9666', 'learning_rate': '4.99e-05', 'epoch': '0.241', 'num_input_tokens_seen': 19595931, 'train_runtime': '9913', 'train_tokens_per_second': '1977'} +{'loss': '0.8121', 'grad_norm': '1.397', 'learning_rate': '4.99e-05', 'epoch': '0.2411', 'num_input_tokens_seen': 19597978, 'train_runtime': '9914', 'train_tokens_per_second': '1977'} +{'loss': '0.3735', 'grad_norm': '0.7221', 'learning_rate': '4.99e-05', 'epoch': '0.2411', 'num_input_tokens_seen': 19600025, 'train_runtime': '9915', 'train_tokens_per_second': '1977'} +{'loss': '0.4598', 'grad_norm': '1.167', 'learning_rate': '4.99e-05', 'epoch': '0.2411', 'num_input_tokens_seen': 19602072, 'train_runtime': '9916', 'train_tokens_per_second': '1977'} +{'loss': '1.699', 'grad_norm': '2.598', 'learning_rate': '4.99e-05', 'epoch': '0.2411', 'num_input_tokens_seen': 19604119, 'train_runtime': '9917', 'train_tokens_per_second': '1977'} +{'loss': '0.577', 'grad_norm': '1.19', 'learning_rate': '4.99e-05', 'epoch': '0.2412', 'num_input_tokens_seen': 19606166, 'train_runtime': '9918', 'train_tokens_per_second': '1977'} +{'loss': '0.3623', 'grad_norm': '0.923', 'learning_rate': '4.99e-05', 'epoch': '0.2412', 'num_input_tokens_seen': 19608213, 'train_runtime': '9919', 'train_tokens_per_second': '1977'} +{'loss': '0.2829', 'grad_norm': '0.8708', 'learning_rate': '4.99e-05', 'epoch': '0.2412', 'num_input_tokens_seen': 19610260, 'train_runtime': '9920', 'train_tokens_per_second': '1977'} +{'loss': '1.955', 'grad_norm': '2.458', 'learning_rate': '4.99e-05', 'epoch': '0.2412', 'num_input_tokens_seen': 19612307, 'train_runtime': '9921', 'train_tokens_per_second': '1977'} +{'loss': '0.8375', 'grad_norm': '1.287', 'learning_rate': '4.99e-05', 'epoch': '0.2413', 'num_input_tokens_seen': 19614354, 'train_runtime': '9922', 'train_tokens_per_second': '1977'} +{'loss': '1.666', 'grad_norm': '2.329', 'learning_rate': '4.99e-05', 'epoch': '0.2413', 'num_input_tokens_seen': 19616401, 'train_runtime': '9923', 'train_tokens_per_second': '1977'} +{'loss': '0.8213', 'grad_norm': '1.207', 'learning_rate': '4.99e-05', 'epoch': '0.2413', 'num_input_tokens_seen': 19618448, 'train_runtime': '9924', 'train_tokens_per_second': '1977'} +{'loss': '0.8219', 'grad_norm': '1.227', 'learning_rate': '4.99e-05', 'epoch': '0.2413', 'num_input_tokens_seen': 19620495, 'train_runtime': '9925', 'train_tokens_per_second': '1977'} +{'loss': '0.6063', 'grad_norm': '1.394', 'learning_rate': '4.99e-05', 'epoch': '0.2414', 'num_input_tokens_seen': 19622542, 'train_runtime': '9926', 'train_tokens_per_second': '1977'} +{'loss': '0.5509', 'grad_norm': '1.221', 'learning_rate': '4.99e-05', 'epoch': '0.2414', 'num_input_tokens_seen': 19624589, 'train_runtime': '9927', 'train_tokens_per_second': '1977'} +{'loss': '1.733', 'grad_norm': '2.073', 'learning_rate': '4.99e-05', 'epoch': '0.2414', 'num_input_tokens_seen': 19626636, 'train_runtime': '9928', 'train_tokens_per_second': '1977'} +{'loss': '0.5547', 'grad_norm': '1.092', 'learning_rate': '4.99e-05', 'epoch': '0.2414', 'num_input_tokens_seen': 19628683, 'train_runtime': '9929', 'train_tokens_per_second': '1977'} +{'loss': '0.9093', 'grad_norm': '1.367', 'learning_rate': '4.99e-05', 'epoch': '0.2415', 'num_input_tokens_seen': 19630730, 'train_runtime': '9930', 'train_tokens_per_second': '1977'} +{'loss': '1.397', 'grad_norm': '2.22', 'learning_rate': '4.99e-05', 'epoch': '0.2415', 'num_input_tokens_seen': 19632777, 'train_runtime': '9931', 'train_tokens_per_second': '1977'} +{'loss': '0.5372', 'grad_norm': '1.059', 'learning_rate': '4.99e-05', 'epoch': '0.2415', 'num_input_tokens_seen': 19634824, 'train_runtime': '9932', 'train_tokens_per_second': '1977'} +{'loss': '0.6765', 'grad_norm': '1.228', 'learning_rate': '4.99e-05', 'epoch': '0.2415', 'num_input_tokens_seen': 19636871, 'train_runtime': '9934', 'train_tokens_per_second': '1977'} +{'loss': '0.2463', 'grad_norm': '0.9881', 'learning_rate': '4.99e-05', 'epoch': '0.2416', 'num_input_tokens_seen': 19638918, 'train_runtime': '9935', 'train_tokens_per_second': '1977'} +{'loss': '1.747', 'grad_norm': '2.436', 'learning_rate': '4.99e-05', 'epoch': '0.2416', 'num_input_tokens_seen': 19640965, 'train_runtime': '9936', 'train_tokens_per_second': '1977'} +{'loss': '0.7678', 'grad_norm': '1.138', 'learning_rate': '4.99e-05', 'epoch': '0.2416', 'num_input_tokens_seen': 19643012, 'train_runtime': '9937', 'train_tokens_per_second': '1977'} +{'loss': '0.6029', 'grad_norm': '1.53', 'learning_rate': '4.99e-05', 'epoch': '0.2416', 'num_input_tokens_seen': 19645059, 'train_runtime': '9938', 'train_tokens_per_second': '1977'} +{'loss': '1.721', 'grad_norm': '2.613', 'learning_rate': '4.99e-05', 'epoch': '0.2417', 'num_input_tokens_seen': 19647106, 'train_runtime': '9939', 'train_tokens_per_second': '1977'} +{'loss': '0.3417', 'grad_norm': '0.9744', 'learning_rate': '4.99e-05', 'epoch': '0.2417', 'num_input_tokens_seen': 19649153, 'train_runtime': '9940', 'train_tokens_per_second': '1977'} +{'loss': '1.966', 'grad_norm': '2.577', 'learning_rate': '4.99e-05', 'epoch': '0.2417', 'num_input_tokens_seen': 19651200, 'train_runtime': '9941', 'train_tokens_per_second': '1977'} +{'loss': '0.792', 'grad_norm': '1.42', 'learning_rate': '4.99e-05', 'epoch': '0.2417', 'num_input_tokens_seen': 19653247, 'train_runtime': '9942', 'train_tokens_per_second': '1977'} +{'loss': '1.417', 'grad_norm': '1.995', 'learning_rate': '4.99e-05', 'epoch': '0.2418', 'num_input_tokens_seen': 19655294, 'train_runtime': '9943', 'train_tokens_per_second': '1977'} +{'loss': '0.2749', 'grad_norm': '0.9088', 'learning_rate': '4.99e-05', 'epoch': '0.2418', 'num_input_tokens_seen': 19657341, 'train_runtime': '9944', 'train_tokens_per_second': '1977'} +{'loss': '0.4962', 'grad_norm': '1.031', 'learning_rate': '4.99e-05', 'epoch': '0.2418', 'num_input_tokens_seen': 19659388, 'train_runtime': '9945', 'train_tokens_per_second': '1977'} +{'loss': '0.8362', 'grad_norm': '1.204', 'learning_rate': '4.99e-05', 'epoch': '0.2418', 'num_input_tokens_seen': 19661435, 'train_runtime': '9946', 'train_tokens_per_second': '1977'} +{'loss': '0.4429', 'grad_norm': '0.8929', 'learning_rate': '4.99e-05', 'epoch': '0.2419', 'num_input_tokens_seen': 19663482, 'train_runtime': '9947', 'train_tokens_per_second': '1977'} +{'loss': '0.5775', 'grad_norm': '1.343', 'learning_rate': '4.99e-05', 'epoch': '0.2419', 'num_input_tokens_seen': 19665529, 'train_runtime': '9948', 'train_tokens_per_second': '1977'} +{'loss': '1.688', 'grad_norm': '2.491', 'learning_rate': '4.99e-05', 'epoch': '0.2419', 'num_input_tokens_seen': 19667576, 'train_runtime': '9949', 'train_tokens_per_second': '1977'} +{'loss': '0.8497', 'grad_norm': '1.533', 'learning_rate': '4.99e-05', 'epoch': '0.2419', 'num_input_tokens_seen': 19669623, 'train_runtime': '9950', 'train_tokens_per_second': '1977'} +{'loss': '0.9553', 'grad_norm': '2.097', 'learning_rate': '4.99e-05', 'epoch': '0.242', 'num_input_tokens_seen': 19671670, 'train_runtime': '9951', 'train_tokens_per_second': '1977'} +{'loss': '0.624', 'grad_norm': '1.209', 'learning_rate': '4.99e-05', 'epoch': '0.242', 'num_input_tokens_seen': 19673717, 'train_runtime': '9952', 'train_tokens_per_second': '1977'} +{'loss': '0.4009', 'grad_norm': '1.068', 'learning_rate': '4.99e-05', 'epoch': '0.242', 'num_input_tokens_seen': 19675764, 'train_runtime': '9953', 'train_tokens_per_second': '1977'} +{'loss': '0.5735', 'grad_norm': '1.322', 'learning_rate': '4.99e-05', 'epoch': '0.242', 'num_input_tokens_seen': 19677811, 'train_runtime': '9954', 'train_tokens_per_second': '1977'} +{'loss': '0.7408', 'grad_norm': '1.254', 'learning_rate': '4.99e-05', 'epoch': '0.2421', 'num_input_tokens_seen': 19679858, 'train_runtime': '9955', 'train_tokens_per_second': '1977'} +{'loss': '0.6898', 'grad_norm': '1.158', 'learning_rate': '4.99e-05', 'epoch': '0.2421', 'num_input_tokens_seen': 19681905, 'train_runtime': '9956', 'train_tokens_per_second': '1977'} +{'loss': '0.4201', 'grad_norm': '1.207', 'learning_rate': '4.99e-05', 'epoch': '0.2421', 'num_input_tokens_seen': 19683952, 'train_runtime': '9957', 'train_tokens_per_second': '1977'} +{'loss': '1.4', 'grad_norm': '1.915', 'learning_rate': '4.99e-05', 'epoch': '0.2421', 'num_input_tokens_seen': 19685999, 'train_runtime': '9958', 'train_tokens_per_second': '1977'} +{'loss': '0.2091', 'grad_norm': '0.8492', 'learning_rate': '4.99e-05', 'epoch': '0.2422', 'num_input_tokens_seen': 19688046, 'train_runtime': '9959', 'train_tokens_per_second': '1977'} +{'loss': '1.237', 'grad_norm': '2.183', 'learning_rate': '4.99e-05', 'epoch': '0.2422', 'num_input_tokens_seen': 19690093, 'train_runtime': '9960', 'train_tokens_per_second': '1977'} +{'loss': '0.667', 'grad_norm': '1.4', 'learning_rate': '4.99e-05', 'epoch': '0.2422', 'num_input_tokens_seen': 19692140, 'train_runtime': '9961', 'train_tokens_per_second': '1977'} +{'loss': '0.5641', 'grad_norm': '1.212', 'learning_rate': '4.99e-05', 'epoch': '0.2422', 'num_input_tokens_seen': 19694187, 'train_runtime': '9962', 'train_tokens_per_second': '1977'} +{'loss': '1.495', 'grad_norm': '2.734', 'learning_rate': '4.99e-05', 'epoch': '0.2423', 'num_input_tokens_seen': 19696234, 'train_runtime': '9964', 'train_tokens_per_second': '1977'} +{'loss': '0.5514', 'grad_norm': '1.09', 'learning_rate': '4.99e-05', 'epoch': '0.2423', 'num_input_tokens_seen': 19698281, 'train_runtime': '9965', 'train_tokens_per_second': '1977'} +{'loss': '0.2683', 'grad_norm': '0.881', 'learning_rate': '4.99e-05', 'epoch': '0.2423', 'num_input_tokens_seen': 19700328, 'train_runtime': '9966', 'train_tokens_per_second': '1977'} +{'loss': '0.838', 'grad_norm': '1.31', 'learning_rate': '4.99e-05', 'epoch': '0.2423', 'num_input_tokens_seen': 19702375, 'train_runtime': '9967', 'train_tokens_per_second': '1977'} +{'loss': '0.5289', 'grad_norm': '0.8989', 'learning_rate': '4.99e-05', 'epoch': '0.2424', 'num_input_tokens_seen': 19704422, 'train_runtime': '9968', 'train_tokens_per_second': '1977'} +{'loss': '0.2724', 'grad_norm': '0.7893', 'learning_rate': '4.99e-05', 'epoch': '0.2424', 'num_input_tokens_seen': 19706469, 'train_runtime': '9969', 'train_tokens_per_second': '1977'} +{'loss': '0.5677', 'grad_norm': '1.149', 'learning_rate': '4.99e-05', 'epoch': '0.2424', 'num_input_tokens_seen': 19708516, 'train_runtime': '9970', 'train_tokens_per_second': '1977'} +{'loss': '0.9702', 'grad_norm': '1.463', 'learning_rate': '4.99e-05', 'epoch': '0.2424', 'num_input_tokens_seen': 19710563, 'train_runtime': '9971', 'train_tokens_per_second': '1977'} +{'loss': '0.2049', 'grad_norm': '0.7305', 'learning_rate': '4.99e-05', 'epoch': '0.2425', 'num_input_tokens_seen': 19712610, 'train_runtime': '9972', 'train_tokens_per_second': '1977'} +{'loss': '0.4911', 'grad_norm': '1.237', 'learning_rate': '4.99e-05', 'epoch': '0.2425', 'num_input_tokens_seen': 19714657, 'train_runtime': '9973', 'train_tokens_per_second': '1977'} +{'loss': '1.411', 'grad_norm': '2.106', 'learning_rate': '4.99e-05', 'epoch': '0.2425', 'num_input_tokens_seen': 19716704, 'train_runtime': '9974', 'train_tokens_per_second': '1977'} +{'loss': '0.3026', 'grad_norm': '0.7786', 'learning_rate': '4.99e-05', 'epoch': '0.2425', 'num_input_tokens_seen': 19718751, 'train_runtime': '9975', 'train_tokens_per_second': '1977'} +{'loss': '0.5649', 'grad_norm': '1.232', 'learning_rate': '4.99e-05', 'epoch': '0.2426', 'num_input_tokens_seen': 19720798, 'train_runtime': '9976', 'train_tokens_per_second': '1977'} +{'loss': '0.2862', 'grad_norm': '0.9296', 'learning_rate': '4.99e-05', 'epoch': '0.2426', 'num_input_tokens_seen': 19722845, 'train_runtime': '9977', 'train_tokens_per_second': '1977'} +{'loss': '1.005', 'grad_norm': '1.374', 'learning_rate': '4.99e-05', 'epoch': '0.2426', 'num_input_tokens_seen': 19724892, 'train_runtime': '9978', 'train_tokens_per_second': '1977'} +{'loss': '0.2927', 'grad_norm': '0.8184', 'learning_rate': '4.99e-05', 'epoch': '0.2426', 'num_input_tokens_seen': 19726939, 'train_runtime': '9979', 'train_tokens_per_second': '1977'} +{'loss': '0.7309', 'grad_norm': '1.308', 'learning_rate': '4.99e-05', 'epoch': '0.2427', 'num_input_tokens_seen': 19728986, 'train_runtime': '9980', 'train_tokens_per_second': '1977'} +{'loss': '0.2718', 'grad_norm': '0.7692', 'learning_rate': '4.99e-05', 'epoch': '0.2427', 'num_input_tokens_seen': 19731033, 'train_runtime': '9981', 'train_tokens_per_second': '1977'} +{'loss': '0.3306', 'grad_norm': '0.729', 'learning_rate': '4.99e-05', 'epoch': '0.2427', 'num_input_tokens_seen': 19733080, 'train_runtime': '9982', 'train_tokens_per_second': '1977'} +{'loss': '0.6187', 'grad_norm': '1.294', 'learning_rate': '4.99e-05', 'epoch': '0.2427', 'num_input_tokens_seen': 19735127, 'train_runtime': '9983', 'train_tokens_per_second': '1977'} +{'loss': '0.4667', 'grad_norm': '1.186', 'learning_rate': '4.99e-05', 'epoch': '0.2428', 'num_input_tokens_seen': 19737174, 'train_runtime': '9984', 'train_tokens_per_second': '1977'} +{'loss': '0.2752', 'grad_norm': '0.8487', 'learning_rate': '4.99e-05', 'epoch': '0.2428', 'num_input_tokens_seen': 19739221, 'train_runtime': '9985', 'train_tokens_per_second': '1977'} +{'loss': '1.135', 'grad_norm': '1.722', 'learning_rate': '4.99e-05', 'epoch': '0.2428', 'num_input_tokens_seen': 19741268, 'train_runtime': '9986', 'train_tokens_per_second': '1977'} +{'loss': '0.8992', 'grad_norm': '1.494', 'learning_rate': '4.99e-05', 'epoch': '0.2428', 'num_input_tokens_seen': 19743315, 'train_runtime': '9987', 'train_tokens_per_second': '1977'} +{'loss': '0.4912', 'grad_norm': '1.058', 'learning_rate': '4.99e-05', 'epoch': '0.2429', 'num_input_tokens_seen': 19745362, 'train_runtime': '9988', 'train_tokens_per_second': '1977'} +{'loss': '0.8591', 'grad_norm': '1.408', 'learning_rate': '4.99e-05', 'epoch': '0.2429', 'num_input_tokens_seen': 19747409, 'train_runtime': '9989', 'train_tokens_per_second': '1977'} +{'loss': '0.6287', 'grad_norm': '0.8041', 'learning_rate': '4.99e-05', 'epoch': '0.2429', 'num_input_tokens_seen': 19749456, 'train_runtime': '9990', 'train_tokens_per_second': '1977'} +{'loss': '0.856', 'grad_norm': '1.276', 'learning_rate': '4.99e-05', 'epoch': '0.2429', 'num_input_tokens_seen': 19751503, 'train_runtime': '9991', 'train_tokens_per_second': '1977'} +{'loss': '0.3979', 'grad_norm': '1.001', 'learning_rate': '4.99e-05', 'epoch': '0.243', 'num_input_tokens_seen': 19753550, 'train_runtime': '9993', 'train_tokens_per_second': '1977'} +{'loss': '0.7819', 'grad_norm': '1.405', 'learning_rate': '4.99e-05', 'epoch': '0.243', 'num_input_tokens_seen': 19755597, 'train_runtime': '9994', 'train_tokens_per_second': '1977'} +{'loss': '0.6761', 'grad_norm': '1.014', 'learning_rate': '4.99e-05', 'epoch': '0.243', 'num_input_tokens_seen': 19757644, 'train_runtime': '9995', 'train_tokens_per_second': '1977'} +{'loss': '1.287', 'grad_norm': '1.842', 'learning_rate': '4.989e-05', 'epoch': '0.243', 'num_input_tokens_seen': 19759691, 'train_runtime': '9996', 'train_tokens_per_second': '1977'} +{'loss': '0.8368', 'grad_norm': '1.056', 'learning_rate': '4.989e-05', 'epoch': '0.2431', 'num_input_tokens_seen': 19761738, 'train_runtime': '9997', 'train_tokens_per_second': '1977'} +{'loss': '1.971', 'grad_norm': '2.662', 'learning_rate': '4.989e-05', 'epoch': '0.2431', 'num_input_tokens_seen': 19763785, 'train_runtime': '9998', 'train_tokens_per_second': '1977'} +{'loss': '1.15', 'grad_norm': '2.338', 'learning_rate': '4.989e-05', 'epoch': '0.2431', 'num_input_tokens_seen': 19765832, 'train_runtime': '9999', 'train_tokens_per_second': '1977'} +{'loss': '0.3403', 'grad_norm': '1.088', 'learning_rate': '4.989e-05', 'epoch': '0.2431', 'num_input_tokens_seen': 19767879, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.74', 'grad_norm': '1.334', 'learning_rate': '4.989e-05', 'epoch': '0.2432', 'num_input_tokens_seen': 19769926, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.364', 'grad_norm': '1.934', 'learning_rate': '4.989e-05', 'epoch': '0.2432', 'num_input_tokens_seen': 19771973, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3508', 'grad_norm': '0.8528', 'learning_rate': '4.989e-05', 'epoch': '0.2432', 'num_input_tokens_seen': 19774020, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.207', 'grad_norm': '1.795', 'learning_rate': '4.989e-05', 'epoch': '0.2432', 'num_input_tokens_seen': 19776067, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2593', 'grad_norm': '0.7686', 'learning_rate': '4.989e-05', 'epoch': '0.2433', 'num_input_tokens_seen': 19778114, 'train_runtime': '1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4693', 'grad_norm': '1.278', 'learning_rate': '4.989e-05', 'epoch': '0.2433', 'num_input_tokens_seen': 19780161, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6819', 'grad_norm': '1.235', 'learning_rate': '4.989e-05', 'epoch': '0.2433', 'num_input_tokens_seen': 19782208, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '1.97', 'learning_rate': '4.989e-05', 'epoch': '0.2433', 'num_input_tokens_seen': 19784255, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7599', 'grad_norm': '1.472', 'learning_rate': '4.989e-05', 'epoch': '0.2434', 'num_input_tokens_seen': 19786302, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.199', 'grad_norm': '2.005', 'learning_rate': '4.989e-05', 'epoch': '0.2434', 'num_input_tokens_seen': 19788349, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5365', 'grad_norm': '1.174', 'learning_rate': '4.989e-05', 'epoch': '0.2434', 'num_input_tokens_seen': 19790396, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4931', 'grad_norm': '1.58', 'learning_rate': '4.989e-05', 'epoch': '0.2434', 'num_input_tokens_seen': 19792443, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.268', 'grad_norm': '2.365', 'learning_rate': '4.989e-05', 'epoch': '0.2435', 'num_input_tokens_seen': 19794490, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3328', 'grad_norm': '0.8243', 'learning_rate': '4.989e-05', 'epoch': '0.2435', 'num_input_tokens_seen': 19796537, 'train_runtime': '1.001e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4454', 'grad_norm': '1.149', 'learning_rate': '4.989e-05', 'epoch': '0.2435', 'num_input_tokens_seen': 19798584, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2081', 'grad_norm': '0.7716', 'learning_rate': '4.989e-05', 'epoch': '0.2435', 'num_input_tokens_seen': 19800631, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2634', 'grad_norm': '0.8723', 'learning_rate': '4.989e-05', 'epoch': '0.2436', 'num_input_tokens_seen': 19802678, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4003', 'grad_norm': '0.9385', 'learning_rate': '4.989e-05', 'epoch': '0.2436', 'num_input_tokens_seen': 19804725, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5896', 'grad_norm': '1.346', 'learning_rate': '4.989e-05', 'epoch': '0.2436', 'num_input_tokens_seen': 19806772, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9389', 'grad_norm': '1.268', 'learning_rate': '4.989e-05', 'epoch': '0.2436', 'num_input_tokens_seen': 19808819, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.31', 'grad_norm': '0.8431', 'learning_rate': '4.989e-05', 'epoch': '0.2437', 'num_input_tokens_seen': 19810866, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1954', 'grad_norm': '0.8139', 'learning_rate': '4.989e-05', 'epoch': '0.2437', 'num_input_tokens_seen': 19812913, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9877', 'grad_norm': '1.383', 'learning_rate': '4.989e-05', 'epoch': '0.2437', 'num_input_tokens_seen': 19814960, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5147', 'grad_norm': '1.191', 'learning_rate': '4.989e-05', 'epoch': '0.2437', 'num_input_tokens_seen': 19817007, 'train_runtime': '1.002e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.473', 'learning_rate': '4.989e-05', 'epoch': '0.2438', 'num_input_tokens_seen': 19819054, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '2.127', 'learning_rate': '4.989e-05', 'epoch': '0.2438', 'num_input_tokens_seen': 19821101, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.341', 'grad_norm': '2.282', 'learning_rate': '4.989e-05', 'epoch': '0.2438', 'num_input_tokens_seen': 19823148, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.691', 'grad_norm': '2.451', 'learning_rate': '4.989e-05', 'epoch': '0.2439', 'num_input_tokens_seen': 19825195, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.08', 'grad_norm': '1.945', 'learning_rate': '4.989e-05', 'epoch': '0.2439', 'num_input_tokens_seen': 19827242, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7423', 'grad_norm': '1.266', 'learning_rate': '4.989e-05', 'epoch': '0.2439', 'num_input_tokens_seen': 19829289, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.411', 'grad_norm': '0.9138', 'learning_rate': '4.989e-05', 'epoch': '0.2439', 'num_input_tokens_seen': 19831336, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9472', 'grad_norm': '1.701', 'learning_rate': '4.989e-05', 'epoch': '0.244', 'num_input_tokens_seen': 19833383, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3231', 'grad_norm': '0.8422', 'learning_rate': '4.989e-05', 'epoch': '0.244', 'num_input_tokens_seen': 19835430, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4991', 'grad_norm': '0.9247', 'learning_rate': '4.989e-05', 'epoch': '0.244', 'num_input_tokens_seen': 19837477, 'train_runtime': '1.003e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.585', 'grad_norm': '1.765', 'learning_rate': '4.989e-05', 'epoch': '0.244', 'num_input_tokens_seen': 19839524, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.167', 'grad_norm': '1.9', 'learning_rate': '4.989e-05', 'epoch': '0.2441', 'num_input_tokens_seen': 19841571, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4543', 'grad_norm': '0.9818', 'learning_rate': '4.989e-05', 'epoch': '0.2441', 'num_input_tokens_seen': 19843618, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.172', 'grad_norm': '2.247', 'learning_rate': '4.989e-05', 'epoch': '0.2441', 'num_input_tokens_seen': 19845665, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.03', 'grad_norm': '3.006', 'learning_rate': '4.989e-05', 'epoch': '0.2441', 'num_input_tokens_seen': 19847712, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3351', 'grad_norm': '0.9212', 'learning_rate': '4.989e-05', 'epoch': '0.2442', 'num_input_tokens_seen': 19849759, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.463', 'learning_rate': '4.989e-05', 'epoch': '0.2442', 'num_input_tokens_seen': 19851806, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.657', 'grad_norm': '2.391', 'learning_rate': '4.989e-05', 'epoch': '0.2442', 'num_input_tokens_seen': 19853853, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3228', 'grad_norm': '0.9319', 'learning_rate': '4.989e-05', 'epoch': '0.2442', 'num_input_tokens_seen': 19855900, 'train_runtime': '1.004e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.157', 'grad_norm': '1.767', 'learning_rate': '4.989e-05', 'epoch': '0.2443', 'num_input_tokens_seen': 19857947, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.023', 'grad_norm': '2.146', 'learning_rate': '4.989e-05', 'epoch': '0.2443', 'num_input_tokens_seen': 19859994, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5501', 'grad_norm': '1.268', 'learning_rate': '4.989e-05', 'epoch': '0.2443', 'num_input_tokens_seen': 19862041, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4344', 'grad_norm': '0.9838', 'learning_rate': '4.989e-05', 'epoch': '0.2443', 'num_input_tokens_seen': 19864088, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6208', 'grad_norm': '1.244', 'learning_rate': '4.989e-05', 'epoch': '0.2444', 'num_input_tokens_seen': 19866135, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.113', 'grad_norm': '1.733', 'learning_rate': '4.989e-05', 'epoch': '0.2444', 'num_input_tokens_seen': 19868182, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7542', 'grad_norm': '1.318', 'learning_rate': '4.989e-05', 'epoch': '0.2444', 'num_input_tokens_seen': 19870229, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8195', 'grad_norm': '1.393', 'learning_rate': '4.989e-05', 'epoch': '0.2444', 'num_input_tokens_seen': 19872276, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8012', 'grad_norm': '1.313', 'learning_rate': '4.989e-05', 'epoch': '0.2445', 'num_input_tokens_seen': 19874323, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.107', 'learning_rate': '4.989e-05', 'epoch': '0.2445', 'num_input_tokens_seen': 19876370, 'train_runtime': '1.005e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3885', 'grad_norm': '0.9951', 'learning_rate': '4.989e-05', 'epoch': '0.2445', 'num_input_tokens_seen': 19878417, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6183', 'grad_norm': '1.528', 'learning_rate': '4.989e-05', 'epoch': '0.2445', 'num_input_tokens_seen': 19880464, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8991', 'grad_norm': '1.352', 'learning_rate': '4.989e-05', 'epoch': '0.2446', 'num_input_tokens_seen': 19882511, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5829', 'grad_norm': '1.264', 'learning_rate': '4.989e-05', 'epoch': '0.2446', 'num_input_tokens_seen': 19884558, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '1.716', 'learning_rate': '4.989e-05', 'epoch': '0.2446', 'num_input_tokens_seen': 19886605, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.397', 'grad_norm': '2.457', 'learning_rate': '4.989e-05', 'epoch': '0.2446', 'num_input_tokens_seen': 19888652, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6752', 'grad_norm': '1.072', 'learning_rate': '4.989e-05', 'epoch': '0.2447', 'num_input_tokens_seen': 19890699, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '2.024', 'learning_rate': '4.989e-05', 'epoch': '0.2447', 'num_input_tokens_seen': 19892746, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7762', 'grad_norm': '1.392', 'learning_rate': '4.989e-05', 'epoch': '0.2447', 'num_input_tokens_seen': 19894793, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.05', 'grad_norm': '1.606', 'learning_rate': '4.989e-05', 'epoch': '0.2447', 'num_input_tokens_seen': 19896840, 'train_runtime': '1.006e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7884', 'grad_norm': '1.677', 'learning_rate': '4.989e-05', 'epoch': '0.2448', 'num_input_tokens_seen': 19898887, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3902', 'grad_norm': '0.95', 'learning_rate': '4.989e-05', 'epoch': '0.2448', 'num_input_tokens_seen': 19900934, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.696', 'grad_norm': '2.19', 'learning_rate': '4.989e-05', 'epoch': '0.2448', 'num_input_tokens_seen': 19902981, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.52', 'grad_norm': '1.994', 'learning_rate': '4.989e-05', 'epoch': '0.2448', 'num_input_tokens_seen': 19905028, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9439', 'grad_norm': '1.477', 'learning_rate': '4.989e-05', 'epoch': '0.2449', 'num_input_tokens_seen': 19907075, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5906', 'grad_norm': '1.167', 'learning_rate': '4.989e-05', 'epoch': '0.2449', 'num_input_tokens_seen': 19909122, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3831', 'grad_norm': '0.8981', 'learning_rate': '4.989e-05', 'epoch': '0.2449', 'num_input_tokens_seen': 19911169, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8001', 'grad_norm': '1.582', 'learning_rate': '4.989e-05', 'epoch': '0.2449', 'num_input_tokens_seen': 19913216, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3349', 'grad_norm': '0.8406', 'learning_rate': '4.989e-05', 'epoch': '0.245', 'num_input_tokens_seen': 19915263, 'train_runtime': '1.007e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.338', 'grad_norm': '1.057', 'learning_rate': '4.989e-05', 'epoch': '0.245', 'num_input_tokens_seen': 19917310, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3549', 'grad_norm': '0.842', 'learning_rate': '4.989e-05', 'epoch': '0.245', 'num_input_tokens_seen': 19919357, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9031', 'grad_norm': '1.68', 'learning_rate': '4.989e-05', 'epoch': '0.245', 'num_input_tokens_seen': 19921404, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3459', 'grad_norm': '0.816', 'learning_rate': '4.989e-05', 'epoch': '0.2451', 'num_input_tokens_seen': 19923451, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.395', 'grad_norm': '2.085', 'learning_rate': '4.989e-05', 'epoch': '0.2451', 'num_input_tokens_seen': 19925498, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6388', 'grad_norm': '0.9908', 'learning_rate': '4.989e-05', 'epoch': '0.2451', 'num_input_tokens_seen': 19927545, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9922', 'grad_norm': '1.408', 'learning_rate': '4.989e-05', 'epoch': '0.2451', 'num_input_tokens_seen': 19929592, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.273', 'grad_norm': '2.333', 'learning_rate': '4.989e-05', 'epoch': '0.2452', 'num_input_tokens_seen': 19931639, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.784', 'grad_norm': '2.726', 'learning_rate': '4.989e-05', 'epoch': '0.2452', 'num_input_tokens_seen': 19933686, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5263', 'grad_norm': '1.089', 'learning_rate': '4.989e-05', 'epoch': '0.2452', 'num_input_tokens_seen': 19935733, 'train_runtime': '1.008e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4317', 'grad_norm': '0.9438', 'learning_rate': '4.989e-05', 'epoch': '0.2452', 'num_input_tokens_seen': 19937780, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.34', 'grad_norm': '2.216', 'learning_rate': '4.989e-05', 'epoch': '0.2453', 'num_input_tokens_seen': 19939827, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8653', 'grad_norm': '2.339', 'learning_rate': '4.989e-05', 'epoch': '0.2453', 'num_input_tokens_seen': 19941874, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.002', 'grad_norm': '1.69', 'learning_rate': '4.989e-05', 'epoch': '0.2453', 'num_input_tokens_seen': 19943921, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6429', 'grad_norm': '1.166', 'learning_rate': '4.989e-05', 'epoch': '0.2453', 'num_input_tokens_seen': 19945968, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.034', 'grad_norm': '1.919', 'learning_rate': '4.989e-05', 'epoch': '0.2454', 'num_input_tokens_seen': 19948015, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9442', 'grad_norm': '1.659', 'learning_rate': '4.989e-05', 'epoch': '0.2454', 'num_input_tokens_seen': 19950062, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8534', 'grad_norm': '1.408', 'learning_rate': '4.989e-05', 'epoch': '0.2454', 'num_input_tokens_seen': 19952109, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8067', 'grad_norm': '1.737', 'learning_rate': '4.989e-05', 'epoch': '0.2454', 'num_input_tokens_seen': 19954156, 'train_runtime': '1.009e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3205', 'grad_norm': '1.024', 'learning_rate': '4.989e-05', 'epoch': '0.2455', 'num_input_tokens_seen': 19956203, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7072', 'grad_norm': '1.185', 'learning_rate': '4.989e-05', 'epoch': '0.2455', 'num_input_tokens_seen': 19958250, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.931', 'grad_norm': '2.47', 'learning_rate': '4.989e-05', 'epoch': '0.2455', 'num_input_tokens_seen': 19960297, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3099', 'grad_norm': '0.8088', 'learning_rate': '4.989e-05', 'epoch': '0.2455', 'num_input_tokens_seen': 19962344, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3584', 'grad_norm': '0.9428', 'learning_rate': '4.989e-05', 'epoch': '0.2456', 'num_input_tokens_seen': 19964391, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7503', 'grad_norm': '1.252', 'learning_rate': '4.989e-05', 'epoch': '0.2456', 'num_input_tokens_seen': 19966438, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3566', 'grad_norm': '0.9088', 'learning_rate': '4.989e-05', 'epoch': '0.2456', 'num_input_tokens_seen': 19968485, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3947', 'grad_norm': '1.149', 'learning_rate': '4.989e-05', 'epoch': '0.2456', 'num_input_tokens_seen': 19970532, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7801', 'grad_norm': '1.315', 'learning_rate': '4.989e-05', 'epoch': '0.2457', 'num_input_tokens_seen': 19972579, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5042', 'grad_norm': '1.232', 'learning_rate': '4.989e-05', 'epoch': '0.2457', 'num_input_tokens_seen': 19974626, 'train_runtime': '1.01e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.551', 'grad_norm': '2.442', 'learning_rate': '4.989e-05', 'epoch': '0.2457', 'num_input_tokens_seen': 19976673, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8236', 'grad_norm': '1.26', 'learning_rate': '4.989e-05', 'epoch': '0.2457', 'num_input_tokens_seen': 19978720, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8453', 'grad_norm': '1.551', 'learning_rate': '4.989e-05', 'epoch': '0.2458', 'num_input_tokens_seen': 19980767, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4327', 'grad_norm': '0.9375', 'learning_rate': '4.989e-05', 'epoch': '0.2458', 'num_input_tokens_seen': 19982814, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9177', 'grad_norm': '1.434', 'learning_rate': '4.989e-05', 'epoch': '0.2458', 'num_input_tokens_seen': 19984861, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6785', 'grad_norm': '1.562', 'learning_rate': '4.989e-05', 'epoch': '0.2458', 'num_input_tokens_seen': 19986908, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3919', 'grad_norm': '0.9265', 'learning_rate': '4.989e-05', 'epoch': '0.2459', 'num_input_tokens_seen': 19988955, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4931', 'grad_norm': '1.01', 'learning_rate': '4.989e-05', 'epoch': '0.2459', 'num_input_tokens_seen': 19991002, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.263', 'grad_norm': '2.072', 'learning_rate': '4.989e-05', 'epoch': '0.2459', 'num_input_tokens_seen': 19993049, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.365', 'grad_norm': '1.141', 'learning_rate': '4.989e-05', 'epoch': '0.2459', 'num_input_tokens_seen': 19995096, 'train_runtime': '1.011e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.722', 'grad_norm': '2.247', 'learning_rate': '4.989e-05', 'epoch': '0.246', 'num_input_tokens_seen': 19997143, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6514', 'grad_norm': '1.026', 'learning_rate': '4.989e-05', 'epoch': '0.246', 'num_input_tokens_seen': 19999190, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.974', 'grad_norm': '2.084', 'learning_rate': '4.989e-05', 'epoch': '0.246', 'num_input_tokens_seen': 20001237, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5765', 'grad_norm': '1.319', 'learning_rate': '4.989e-05', 'epoch': '0.246', 'num_input_tokens_seen': 20003284, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.588', 'grad_norm': '2.092', 'learning_rate': '4.989e-05', 'epoch': '0.2461', 'num_input_tokens_seen': 20005331, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8511', 'grad_norm': '1.418', 'learning_rate': '4.989e-05', 'epoch': '0.2461', 'num_input_tokens_seen': 20007378, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5928', 'grad_norm': '1.516', 'learning_rate': '4.989e-05', 'epoch': '0.2461', 'num_input_tokens_seen': 20009425, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2731', 'grad_norm': '0.8735', 'learning_rate': '4.989e-05', 'epoch': '0.2461', 'num_input_tokens_seen': 20011472, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.423', 'grad_norm': '2.094', 'learning_rate': '4.989e-05', 'epoch': '0.2462', 'num_input_tokens_seen': 20013519, 'train_runtime': '1.012e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4728', 'grad_norm': '0.9521', 'learning_rate': '4.989e-05', 'epoch': '0.2462', 'num_input_tokens_seen': 20015566, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6009', 'grad_norm': '0.8576', 'learning_rate': '4.989e-05', 'epoch': '0.2462', 'num_input_tokens_seen': 20017613, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '2.122', 'learning_rate': '4.989e-05', 'epoch': '0.2462', 'num_input_tokens_seen': 20019660, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.234', 'grad_norm': '2.079', 'learning_rate': '4.989e-05', 'epoch': '0.2463', 'num_input_tokens_seen': 20021707, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4134', 'grad_norm': '0.9182', 'learning_rate': '4.989e-05', 'epoch': '0.2463', 'num_input_tokens_seen': 20023754, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8378', 'grad_norm': '1.549', 'learning_rate': '4.989e-05', 'epoch': '0.2463', 'num_input_tokens_seen': 20025801, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6074', 'grad_norm': '1.599', 'learning_rate': '4.989e-05', 'epoch': '0.2463', 'num_input_tokens_seen': 20027848, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7747', 'grad_norm': '1.155', 'learning_rate': '4.989e-05', 'epoch': '0.2464', 'num_input_tokens_seen': 20029895, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.06', 'grad_norm': '2.193', 'learning_rate': '4.989e-05', 'epoch': '0.2464', 'num_input_tokens_seen': 20031942, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.02', 'grad_norm': '2.987', 'learning_rate': '4.989e-05', 'epoch': '0.2464', 'num_input_tokens_seen': 20033989, 'train_runtime': '1.013e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.199', 'grad_norm': '2.685', 'learning_rate': '4.989e-05', 'epoch': '0.2464', 'num_input_tokens_seen': 20036036, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2554', 'grad_norm': '0.8429', 'learning_rate': '4.989e-05', 'epoch': '0.2465', 'num_input_tokens_seen': 20038083, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7344', 'grad_norm': '1.343', 'learning_rate': '4.989e-05', 'epoch': '0.2465', 'num_input_tokens_seen': 20040130, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5146', 'grad_norm': '1.014', 'learning_rate': '4.989e-05', 'epoch': '0.2465', 'num_input_tokens_seen': 20042177, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.75', 'grad_norm': '2.983', 'learning_rate': '4.989e-05', 'epoch': '0.2465', 'num_input_tokens_seen': 20044224, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.824', 'grad_norm': '2.554', 'learning_rate': '4.989e-05', 'epoch': '0.2466', 'num_input_tokens_seen': 20046271, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5697', 'grad_norm': '1.443', 'learning_rate': '4.989e-05', 'epoch': '0.2466', 'num_input_tokens_seen': 20048318, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.548', 'grad_norm': '2.51', 'learning_rate': '4.989e-05', 'epoch': '0.2466', 'num_input_tokens_seen': 20050365, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6328', 'grad_norm': '1.235', 'learning_rate': '4.989e-05', 'epoch': '0.2466', 'num_input_tokens_seen': 20052412, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4505', 'grad_norm': '1.001', 'learning_rate': '4.989e-05', 'epoch': '0.2467', 'num_input_tokens_seen': 20054459, 'train_runtime': '1.014e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.437', 'grad_norm': '1.859', 'learning_rate': '4.989e-05', 'epoch': '0.2467', 'num_input_tokens_seen': 20056506, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.86', 'learning_rate': '4.989e-05', 'epoch': '0.2467', 'num_input_tokens_seen': 20058553, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8954', 'grad_norm': '1.407', 'learning_rate': '4.989e-05', 'epoch': '0.2467', 'num_input_tokens_seen': 20060600, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.283', 'grad_norm': '1.93', 'learning_rate': '4.989e-05', 'epoch': '0.2468', 'num_input_tokens_seen': 20062647, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7612', 'grad_norm': '1.55', 'learning_rate': '4.989e-05', 'epoch': '0.2468', 'num_input_tokens_seen': 20064694, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9316', 'grad_norm': '1.517', 'learning_rate': '4.989e-05', 'epoch': '0.2468', 'num_input_tokens_seen': 20066741, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8434', 'grad_norm': '1.953', 'learning_rate': '4.989e-05', 'epoch': '0.2468', 'num_input_tokens_seen': 20068788, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.518', 'grad_norm': '1.984', 'learning_rate': '4.989e-05', 'epoch': '0.2469', 'num_input_tokens_seen': 20070835, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.164', 'grad_norm': '1.993', 'learning_rate': '4.989e-05', 'epoch': '0.2469', 'num_input_tokens_seen': 20072882, 'train_runtime': '1.015e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.21', 'grad_norm': '2.078', 'learning_rate': '4.989e-05', 'epoch': '0.2469', 'num_input_tokens_seen': 20074929, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.225', 'grad_norm': '2.211', 'learning_rate': '4.989e-05', 'epoch': '0.2469', 'num_input_tokens_seen': 20076976, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5678', 'grad_norm': '1.572', 'learning_rate': '4.989e-05', 'epoch': '0.247', 'num_input_tokens_seen': 20079023, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2875', 'grad_norm': '0.9569', 'learning_rate': '4.989e-05', 'epoch': '0.247', 'num_input_tokens_seen': 20081070, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4831', 'grad_norm': '1.042', 'learning_rate': '4.989e-05', 'epoch': '0.247', 'num_input_tokens_seen': 20083117, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4167', 'grad_norm': '0.9152', 'learning_rate': '4.989e-05', 'epoch': '0.247', 'num_input_tokens_seen': 20085164, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8005', 'grad_norm': '1.184', 'learning_rate': '4.989e-05', 'epoch': '0.2471', 'num_input_tokens_seen': 20087211, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.487', 'grad_norm': '1.449', 'learning_rate': '4.989e-05', 'epoch': '0.2471', 'num_input_tokens_seen': 20089258, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3469', 'grad_norm': '0.7753', 'learning_rate': '4.989e-05', 'epoch': '0.2471', 'num_input_tokens_seen': 20091305, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.635', 'grad_norm': '2.183', 'learning_rate': '4.989e-05', 'epoch': '0.2471', 'num_input_tokens_seen': 20093352, 'train_runtime': '1.016e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5544', 'grad_norm': '1.567', 'learning_rate': '4.989e-05', 'epoch': '0.2472', 'num_input_tokens_seen': 20095399, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9954', 'grad_norm': '1.667', 'learning_rate': '4.989e-05', 'epoch': '0.2472', 'num_input_tokens_seen': 20097446, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3852', 'grad_norm': '0.946', 'learning_rate': '4.989e-05', 'epoch': '0.2472', 'num_input_tokens_seen': 20099493, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.668', 'grad_norm': '2.162', 'learning_rate': '4.989e-05', 'epoch': '0.2472', 'num_input_tokens_seen': 20101540, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7872', 'grad_norm': '1.098', 'learning_rate': '4.989e-05', 'epoch': '0.2473', 'num_input_tokens_seen': 20103587, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8619', 'grad_norm': '1.119', 'learning_rate': '4.989e-05', 'epoch': '0.2473', 'num_input_tokens_seen': 20105634, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.455', 'grad_norm': '1.085', 'learning_rate': '4.989e-05', 'epoch': '0.2473', 'num_input_tokens_seen': 20107681, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4205', 'grad_norm': '0.9476', 'learning_rate': '4.989e-05', 'epoch': '0.2474', 'num_input_tokens_seen': 20109728, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.282', 'grad_norm': '1.888', 'learning_rate': '4.989e-05', 'epoch': '0.2474', 'num_input_tokens_seen': 20111775, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6332', 'grad_norm': '0.8665', 'learning_rate': '4.989e-05', 'epoch': '0.2474', 'num_input_tokens_seen': 20113822, 'train_runtime': '1.017e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8729', 'grad_norm': '1.282', 'learning_rate': '4.989e-05', 'epoch': '0.2474', 'num_input_tokens_seen': 20115869, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6508', 'grad_norm': '1.356', 'learning_rate': '4.989e-05', 'epoch': '0.2475', 'num_input_tokens_seen': 20117916, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3422', 'grad_norm': '0.9051', 'learning_rate': '4.989e-05', 'epoch': '0.2475', 'num_input_tokens_seen': 20119963, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.033', 'grad_norm': '1.785', 'learning_rate': '4.989e-05', 'epoch': '0.2475', 'num_input_tokens_seen': 20122010, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.166', 'grad_norm': '1.929', 'learning_rate': '4.989e-05', 'epoch': '0.2475', 'num_input_tokens_seen': 20124057, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5979', 'grad_norm': '1.168', 'learning_rate': '4.989e-05', 'epoch': '0.2476', 'num_input_tokens_seen': 20126104, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.254', 'grad_norm': '0.8088', 'learning_rate': '4.989e-05', 'epoch': '0.2476', 'num_input_tokens_seen': 20128151, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5854', 'grad_norm': '1.45', 'learning_rate': '4.989e-05', 'epoch': '0.2476', 'num_input_tokens_seen': 20130198, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4726', 'grad_norm': '1.035', 'learning_rate': '4.989e-05', 'epoch': '0.2476', 'num_input_tokens_seen': 20132245, 'train_runtime': '1.018e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3381', 'grad_norm': '1.013', 'learning_rate': '4.989e-05', 'epoch': '0.2477', 'num_input_tokens_seen': 20134292, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9426', 'grad_norm': '1.42', 'learning_rate': '4.989e-05', 'epoch': '0.2477', 'num_input_tokens_seen': 20136339, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3185', 'grad_norm': '0.8944', 'learning_rate': '4.989e-05', 'epoch': '0.2477', 'num_input_tokens_seen': 20138386, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.319', 'grad_norm': '0.7609', 'learning_rate': '4.989e-05', 'epoch': '0.2477', 'num_input_tokens_seen': 20140433, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6747', 'grad_norm': '1.543', 'learning_rate': '4.989e-05', 'epoch': '0.2478', 'num_input_tokens_seen': 20142480, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.689', 'grad_norm': '1.223', 'learning_rate': '4.989e-05', 'epoch': '0.2478', 'num_input_tokens_seen': 20144527, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3269', 'grad_norm': '1.129', 'learning_rate': '4.989e-05', 'epoch': '0.2478', 'num_input_tokens_seen': 20146574, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5068', 'grad_norm': '1.239', 'learning_rate': '4.989e-05', 'epoch': '0.2478', 'num_input_tokens_seen': 20148621, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.428', 'grad_norm': '1.066', 'learning_rate': '4.989e-05', 'epoch': '0.2479', 'num_input_tokens_seen': 20150668, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5138', 'grad_norm': '1.253', 'learning_rate': '4.989e-05', 'epoch': '0.2479', 'num_input_tokens_seen': 20152715, 'train_runtime': '1.019e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7009', 'grad_norm': '1.731', 'learning_rate': '4.989e-05', 'epoch': '0.2479', 'num_input_tokens_seen': 20154762, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.438', 'grad_norm': '1.096', 'learning_rate': '4.989e-05', 'epoch': '0.2479', 'num_input_tokens_seen': 20156809, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.331', 'grad_norm': '3.051', 'learning_rate': '4.989e-05', 'epoch': '0.248', 'num_input_tokens_seen': 20158856, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2948', 'grad_norm': '0.7766', 'learning_rate': '4.989e-05', 'epoch': '0.248', 'num_input_tokens_seen': 20160903, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3243', 'grad_norm': '0.8598', 'learning_rate': '4.989e-05', 'epoch': '0.248', 'num_input_tokens_seen': 20162950, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4699', 'grad_norm': '1.284', 'learning_rate': '4.989e-05', 'epoch': '0.248', 'num_input_tokens_seen': 20164997, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3136', 'grad_norm': '1.13', 'learning_rate': '4.989e-05', 'epoch': '0.2481', 'num_input_tokens_seen': 20167044, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.092', 'grad_norm': '1.848', 'learning_rate': '4.989e-05', 'epoch': '0.2481', 'num_input_tokens_seen': 20169091, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8819', 'grad_norm': '1.371', 'learning_rate': '4.989e-05', 'epoch': '0.2481', 'num_input_tokens_seen': 20171138, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3139', 'grad_norm': '1.019', 'learning_rate': '4.989e-05', 'epoch': '0.2481', 'num_input_tokens_seen': 20173185, 'train_runtime': '1.02e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6805', 'grad_norm': '1.285', 'learning_rate': '4.989e-05', 'epoch': '0.2482', 'num_input_tokens_seen': 20175232, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.197', 'grad_norm': '1.923', 'learning_rate': '4.989e-05', 'epoch': '0.2482', 'num_input_tokens_seen': 20177279, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.506', 'grad_norm': '2.544', 'learning_rate': '4.989e-05', 'epoch': '0.2482', 'num_input_tokens_seen': 20179326, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '2.1', 'learning_rate': '4.989e-05', 'epoch': '0.2482', 'num_input_tokens_seen': 20181373, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8646', 'grad_norm': '1.9', 'learning_rate': '4.989e-05', 'epoch': '0.2483', 'num_input_tokens_seen': 20183420, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9576', 'grad_norm': '1.326', 'learning_rate': '4.989e-05', 'epoch': '0.2483', 'num_input_tokens_seen': 20185467, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4497', 'grad_norm': '1.097', 'learning_rate': '4.989e-05', 'epoch': '0.2483', 'num_input_tokens_seen': 20187514, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4489', 'grad_norm': '1.126', 'learning_rate': '4.989e-05', 'epoch': '0.2483', 'num_input_tokens_seen': 20189561, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.229', 'grad_norm': '2.599', 'learning_rate': '4.989e-05', 'epoch': '0.2484', 'num_input_tokens_seen': 20191608, 'train_runtime': '1.021e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6874', 'grad_norm': '1.202', 'learning_rate': '4.989e-05', 'epoch': '0.2484', 'num_input_tokens_seen': 20193655, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2773', 'grad_norm': '0.8998', 'learning_rate': '4.989e-05', 'epoch': '0.2484', 'num_input_tokens_seen': 20195702, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7328', 'grad_norm': '1.686', 'learning_rate': '4.989e-05', 'epoch': '0.2484', 'num_input_tokens_seen': 20197749, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4748', 'grad_norm': '1.002', 'learning_rate': '4.989e-05', 'epoch': '0.2485', 'num_input_tokens_seen': 20199796, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7697', 'grad_norm': '1.217', 'learning_rate': '4.989e-05', 'epoch': '0.2485', 'num_input_tokens_seen': 20201843, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8583', 'grad_norm': '1.453', 'learning_rate': '4.989e-05', 'epoch': '0.2485', 'num_input_tokens_seen': 20203890, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8729', 'grad_norm': '1.376', 'learning_rate': '4.989e-05', 'epoch': '0.2485', 'num_input_tokens_seen': 20205937, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.506', 'grad_norm': '2.72', 'learning_rate': '4.989e-05', 'epoch': '0.2486', 'num_input_tokens_seen': 20207984, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5008', 'grad_norm': '0.9911', 'learning_rate': '4.989e-05', 'epoch': '0.2486', 'num_input_tokens_seen': 20210031, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.07', 'grad_norm': '2.094', 'learning_rate': '4.989e-05', 'epoch': '0.2486', 'num_input_tokens_seen': 20212078, 'train_runtime': '1.022e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5088', 'grad_norm': '1.08', 'learning_rate': '4.989e-05', 'epoch': '0.2486', 'num_input_tokens_seen': 20214125, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8499', 'grad_norm': '1.268', 'learning_rate': '4.989e-05', 'epoch': '0.2487', 'num_input_tokens_seen': 20216172, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4343', 'grad_norm': '1.041', 'learning_rate': '4.989e-05', 'epoch': '0.2487', 'num_input_tokens_seen': 20218219, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3474', 'grad_norm': '0.8205', 'learning_rate': '4.989e-05', 'epoch': '0.2487', 'num_input_tokens_seen': 20220266, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4703', 'grad_norm': '1.155', 'learning_rate': '4.989e-05', 'epoch': '0.2487', 'num_input_tokens_seen': 20222313, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.718', 'grad_norm': '1.292', 'learning_rate': '4.989e-05', 'epoch': '0.2488', 'num_input_tokens_seen': 20224360, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.819', 'grad_norm': '1.307', 'learning_rate': '4.989e-05', 'epoch': '0.2488', 'num_input_tokens_seen': 20226407, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7906', 'grad_norm': '1.154', 'learning_rate': '4.989e-05', 'epoch': '0.2488', 'num_input_tokens_seen': 20228454, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7969', 'grad_norm': '1.156', 'learning_rate': '4.989e-05', 'epoch': '0.2488', 'num_input_tokens_seen': 20230501, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7383', 'grad_norm': '1.189', 'learning_rate': '4.989e-05', 'epoch': '0.2489', 'num_input_tokens_seen': 20232548, 'train_runtime': '1.023e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9436', 'grad_norm': '1.721', 'learning_rate': '4.989e-05', 'epoch': '0.2489', 'num_input_tokens_seen': 20234595, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2945', 'grad_norm': '0.9775', 'learning_rate': '4.989e-05', 'epoch': '0.2489', 'num_input_tokens_seen': 20236642, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8733', 'grad_norm': '1.389', 'learning_rate': '4.989e-05', 'epoch': '0.2489', 'num_input_tokens_seen': 20238689, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8082', 'grad_norm': '1.229', 'learning_rate': '4.989e-05', 'epoch': '0.249', 'num_input_tokens_seen': 20240736, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3391', 'grad_norm': '0.9221', 'learning_rate': '4.989e-05', 'epoch': '0.249', 'num_input_tokens_seen': 20242783, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8524', 'grad_norm': '1.479', 'learning_rate': '4.989e-05', 'epoch': '0.249', 'num_input_tokens_seen': 20244830, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7012', 'grad_norm': '1.043', 'learning_rate': '4.989e-05', 'epoch': '0.249', 'num_input_tokens_seen': 20246877, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2686', 'grad_norm': '0.7774', 'learning_rate': '4.989e-05', 'epoch': '0.2491', 'num_input_tokens_seen': 20248924, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6192', 'grad_norm': '0.9502', 'learning_rate': '4.989e-05', 'epoch': '0.2491', 'num_input_tokens_seen': 20250971, 'train_runtime': '1.024e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.849', 'grad_norm': '2.288', 'learning_rate': '4.989e-05', 'epoch': '0.2491', 'num_input_tokens_seen': 20253018, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4553', 'grad_norm': '1.073', 'learning_rate': '4.989e-05', 'epoch': '0.2491', 'num_input_tokens_seen': 20255065, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4416', 'grad_norm': '1.264', 'learning_rate': '4.989e-05', 'epoch': '0.2492', 'num_input_tokens_seen': 20257112, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4055', 'grad_norm': '0.9408', 'learning_rate': '4.989e-05', 'epoch': '0.2492', 'num_input_tokens_seen': 20259159, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2685', 'grad_norm': '0.8312', 'learning_rate': '4.989e-05', 'epoch': '0.2492', 'num_input_tokens_seen': 20261206, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3687', 'grad_norm': '0.9507', 'learning_rate': '4.989e-05', 'epoch': '0.2492', 'num_input_tokens_seen': 20263253, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8542', 'grad_norm': '1.449', 'learning_rate': '4.989e-05', 'epoch': '0.2493', 'num_input_tokens_seen': 20265300, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.182', 'grad_norm': '1.859', 'learning_rate': '4.989e-05', 'epoch': '0.2493', 'num_input_tokens_seen': 20267347, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.663', 'grad_norm': '1.251', 'learning_rate': '4.989e-05', 'epoch': '0.2493', 'num_input_tokens_seen': 20269394, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.081', 'grad_norm': '2.044', 'learning_rate': '4.989e-05', 'epoch': '0.2493', 'num_input_tokens_seen': 20271441, 'train_runtime': '1.025e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3447', 'grad_norm': '0.9209', 'learning_rate': '4.989e-05', 'epoch': '0.2494', 'num_input_tokens_seen': 20273488, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9756', 'grad_norm': '1.778', 'learning_rate': '4.989e-05', 'epoch': '0.2494', 'num_input_tokens_seen': 20275535, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3405', 'grad_norm': '0.9365', 'learning_rate': '4.989e-05', 'epoch': '0.2494', 'num_input_tokens_seen': 20277582, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2371', 'grad_norm': '0.8783', 'learning_rate': '4.989e-05', 'epoch': '0.2494', 'num_input_tokens_seen': 20279629, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.81', 'grad_norm': '2.505', 'learning_rate': '4.989e-05', 'epoch': '0.2495', 'num_input_tokens_seen': 20281676, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3015', 'grad_norm': '0.8084', 'learning_rate': '4.989e-05', 'epoch': '0.2495', 'num_input_tokens_seen': 20283723, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7586', 'grad_norm': '1.175', 'learning_rate': '4.989e-05', 'epoch': '0.2495', 'num_input_tokens_seen': 20285770, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2905', 'grad_norm': '0.8834', 'learning_rate': '4.989e-05', 'epoch': '0.2495', 'num_input_tokens_seen': 20287817, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4775', 'grad_norm': '1.007', 'learning_rate': '4.989e-05', 'epoch': '0.2496', 'num_input_tokens_seen': 20289864, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8893', 'grad_norm': '1.623', 'learning_rate': '4.989e-05', 'epoch': '0.2496', 'num_input_tokens_seen': 20291911, 'train_runtime': '1.026e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4168', 'grad_norm': '0.8905', 'learning_rate': '4.989e-05', 'epoch': '0.2496', 'num_input_tokens_seen': 20293958, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6011', 'grad_norm': '1.494', 'learning_rate': '4.989e-05', 'epoch': '0.2496', 'num_input_tokens_seen': 20296005, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7964', 'grad_norm': '1.519', 'learning_rate': '4.989e-05', 'epoch': '0.2497', 'num_input_tokens_seen': 20298052, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2104', 'grad_norm': '0.775', 'learning_rate': '4.989e-05', 'epoch': '0.2497', 'num_input_tokens_seen': 20300099, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9183', 'grad_norm': '1.362', 'learning_rate': '4.988e-05', 'epoch': '0.2497', 'num_input_tokens_seen': 20302146, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.441', 'grad_norm': '2.072', 'learning_rate': '4.988e-05', 'epoch': '0.2497', 'num_input_tokens_seen': 20304193, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3304', 'grad_norm': '0.7963', 'learning_rate': '4.988e-05', 'epoch': '0.2498', 'num_input_tokens_seen': 20306240, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4767', 'grad_norm': '1.125', 'learning_rate': '4.988e-05', 'epoch': '0.2498', 'num_input_tokens_seen': 20308287, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.156', 'grad_norm': '1.711', 'learning_rate': '4.988e-05', 'epoch': '0.2498', 'num_input_tokens_seen': 20310334, 'train_runtime': '1.027e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4496', 'grad_norm': '0.9177', 'learning_rate': '4.988e-05', 'epoch': '0.2498', 'num_input_tokens_seen': 20312381, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2514', 'grad_norm': '0.9689', 'learning_rate': '4.988e-05', 'epoch': '0.2499', 'num_input_tokens_seen': 20314428, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.649', 'grad_norm': '2.461', 'learning_rate': '4.988e-05', 'epoch': '0.2499', 'num_input_tokens_seen': 20316475, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6616', 'grad_norm': '1.45', 'learning_rate': '4.988e-05', 'epoch': '0.2499', 'num_input_tokens_seen': 20318522, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2324', 'grad_norm': '0.915', 'learning_rate': '4.988e-05', 'epoch': '0.2499', 'num_input_tokens_seen': 20320569, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.265', 'grad_norm': '0.9408', 'learning_rate': '4.988e-05', 'epoch': '0.25', 'num_input_tokens_seen': 20322616, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.087', 'grad_norm': '2.103', 'learning_rate': '4.988e-05', 'epoch': '0.25', 'num_input_tokens_seen': 20324663, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8472', 'grad_norm': '1.26', 'learning_rate': '4.988e-05', 'epoch': '0.25', 'num_input_tokens_seen': 20326710, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7554', 'grad_norm': '1.641', 'learning_rate': '4.988e-05', 'epoch': '0.25', 'num_input_tokens_seen': 20328757, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3891', 'grad_norm': '1.008', 'learning_rate': '4.988e-05', 'epoch': '0.2501', 'num_input_tokens_seen': 20330804, 'train_runtime': '1.028e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3444', 'grad_norm': '0.8793', 'learning_rate': '4.988e-05', 'epoch': '0.2501', 'num_input_tokens_seen': 20332851, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9177', 'grad_norm': '1.336', 'learning_rate': '4.988e-05', 'epoch': '0.2501', 'num_input_tokens_seen': 20334898, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5607', 'grad_norm': '1.204', 'learning_rate': '4.988e-05', 'epoch': '0.2501', 'num_input_tokens_seen': 20336945, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5649', 'grad_norm': '1.306', 'learning_rate': '4.988e-05', 'epoch': '0.2502', 'num_input_tokens_seen': 20338992, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2411', 'grad_norm': '1.019', 'learning_rate': '4.988e-05', 'epoch': '0.2502', 'num_input_tokens_seen': 20341039, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6742', 'grad_norm': '1.001', 'learning_rate': '4.988e-05', 'epoch': '0.2502', 'num_input_tokens_seen': 20343086, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4221', 'grad_norm': '1.129', 'learning_rate': '4.988e-05', 'epoch': '0.2502', 'num_input_tokens_seen': 20345133, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8361', 'grad_norm': '1.395', 'learning_rate': '4.988e-05', 'epoch': '0.2503', 'num_input_tokens_seen': 20347180, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5241', 'grad_norm': '1.225', 'learning_rate': '4.988e-05', 'epoch': '0.2503', 'num_input_tokens_seen': 20349227, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2123', 'grad_norm': '0.7851', 'learning_rate': '4.988e-05', 'epoch': '0.2503', 'num_input_tokens_seen': 20351274, 'train_runtime': '1.029e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.05', 'grad_norm': '1.342', 'learning_rate': '4.988e-05', 'epoch': '0.2503', 'num_input_tokens_seen': 20353321, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5404', 'grad_norm': '1.374', 'learning_rate': '4.988e-05', 'epoch': '0.2504', 'num_input_tokens_seen': 20355368, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4147', 'grad_norm': '0.9517', 'learning_rate': '4.988e-05', 'epoch': '0.2504', 'num_input_tokens_seen': 20357415, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8301', 'grad_norm': '0.9839', 'learning_rate': '4.988e-05', 'epoch': '0.2504', 'num_input_tokens_seen': 20359462, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.432', 'grad_norm': '2.118', 'learning_rate': '4.988e-05', 'epoch': '0.2504', 'num_input_tokens_seen': 20361509, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7474', 'grad_norm': '1.103', 'learning_rate': '4.988e-05', 'epoch': '0.2505', 'num_input_tokens_seen': 20363556, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3523', 'grad_norm': '1.046', 'learning_rate': '4.988e-05', 'epoch': '0.2505', 'num_input_tokens_seen': 20365603, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7137', 'grad_norm': '1.449', 'learning_rate': '4.988e-05', 'epoch': '0.2505', 'num_input_tokens_seen': 20367650, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9251', 'grad_norm': '1.517', 'learning_rate': '4.988e-05', 'epoch': '0.2505', 'num_input_tokens_seen': 20369697, 'train_runtime': '1.03e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7758', 'grad_norm': '1.017', 'learning_rate': '4.988e-05', 'epoch': '0.2506', 'num_input_tokens_seen': 20371744, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3721', 'grad_norm': '1.158', 'learning_rate': '4.988e-05', 'epoch': '0.2506', 'num_input_tokens_seen': 20373791, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5562', 'grad_norm': '0.9106', 'learning_rate': '4.988e-05', 'epoch': '0.2506', 'num_input_tokens_seen': 20375838, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6869', 'grad_norm': '1.144', 'learning_rate': '4.988e-05', 'epoch': '0.2506', 'num_input_tokens_seen': 20377885, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.82', 'grad_norm': '1.136', 'learning_rate': '4.988e-05', 'epoch': '0.2507', 'num_input_tokens_seen': 20379932, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9047', 'grad_norm': '1.812', 'learning_rate': '4.988e-05', 'epoch': '0.2507', 'num_input_tokens_seen': 20381979, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3307', 'grad_norm': '0.8693', 'learning_rate': '4.988e-05', 'epoch': '0.2507', 'num_input_tokens_seen': 20384026, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2508', 'grad_norm': '0.8463', 'learning_rate': '4.988e-05', 'epoch': '0.2507', 'num_input_tokens_seen': 20386073, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.344', 'grad_norm': '2.2', 'learning_rate': '4.988e-05', 'epoch': '0.2508', 'num_input_tokens_seen': 20388120, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3029', 'grad_norm': '0.846', 'learning_rate': '4.988e-05', 'epoch': '0.2508', 'num_input_tokens_seen': 20390167, 'train_runtime': '1.031e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3942', 'grad_norm': '0.9858', 'learning_rate': '4.988e-05', 'epoch': '0.2508', 'num_input_tokens_seen': 20392214, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5052', 'grad_norm': '1.024', 'learning_rate': '4.988e-05', 'epoch': '0.2508', 'num_input_tokens_seen': 20394261, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.316', 'grad_norm': '0.788', 'learning_rate': '4.988e-05', 'epoch': '0.2509', 'num_input_tokens_seen': 20396308, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3455', 'grad_norm': '0.9019', 'learning_rate': '4.988e-05', 'epoch': '0.2509', 'num_input_tokens_seen': 20398355, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.911', 'grad_norm': '1.308', 'learning_rate': '4.988e-05', 'epoch': '0.2509', 'num_input_tokens_seen': 20400402, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4953', 'grad_norm': '1.089', 'learning_rate': '4.988e-05', 'epoch': '0.251', 'num_input_tokens_seen': 20402449, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.553', 'grad_norm': '2.434', 'learning_rate': '4.988e-05', 'epoch': '0.251', 'num_input_tokens_seen': 20404496, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.315', 'grad_norm': '0.9033', 'learning_rate': '4.988e-05', 'epoch': '0.251', 'num_input_tokens_seen': 20406543, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.237', 'grad_norm': '2.524', 'learning_rate': '4.988e-05', 'epoch': '0.251', 'num_input_tokens_seen': 20408590, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2895', 'grad_norm': '0.8955', 'learning_rate': '4.988e-05', 'epoch': '0.2511', 'num_input_tokens_seen': 20410637, 'train_runtime': '1.032e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4092', 'grad_norm': '1.105', 'learning_rate': '4.988e-05', 'epoch': '0.2511', 'num_input_tokens_seen': 20412684, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6199', 'grad_norm': '1.314', 'learning_rate': '4.988e-05', 'epoch': '0.2511', 'num_input_tokens_seen': 20414731, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.653', 'grad_norm': '2.794', 'learning_rate': '4.988e-05', 'epoch': '0.2511', 'num_input_tokens_seen': 20416778, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3261', 'grad_norm': '0.7833', 'learning_rate': '4.988e-05', 'epoch': '0.2512', 'num_input_tokens_seen': 20418825, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2641', 'grad_norm': '1.007', 'learning_rate': '4.988e-05', 'epoch': '0.2512', 'num_input_tokens_seen': 20420872, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7744', 'grad_norm': '1.729', 'learning_rate': '4.988e-05', 'epoch': '0.2512', 'num_input_tokens_seen': 20422919, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6389', 'grad_norm': '1.445', 'learning_rate': '4.988e-05', 'epoch': '0.2512', 'num_input_tokens_seen': 20424966, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.939', 'grad_norm': '3.408', 'learning_rate': '4.988e-05', 'epoch': '0.2513', 'num_input_tokens_seen': 20427013, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7617', 'grad_norm': '1.258', 'learning_rate': '4.988e-05', 'epoch': '0.2513', 'num_input_tokens_seen': 20429060, 'train_runtime': '1.033e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7228', 'grad_norm': '1.159', 'learning_rate': '4.988e-05', 'epoch': '0.2513', 'num_input_tokens_seen': 20431107, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.392', 'grad_norm': '0.9078', 'learning_rate': '4.988e-05', 'epoch': '0.2513', 'num_input_tokens_seen': 20433154, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3983', 'grad_norm': '0.8285', 'learning_rate': '4.988e-05', 'epoch': '0.2514', 'num_input_tokens_seen': 20435201, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6095', 'grad_norm': '1.121', 'learning_rate': '4.988e-05', 'epoch': '0.2514', 'num_input_tokens_seen': 20437248, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2713', 'grad_norm': '0.821', 'learning_rate': '4.988e-05', 'epoch': '0.2514', 'num_input_tokens_seen': 20439295, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7361', 'grad_norm': '1.423', 'learning_rate': '4.988e-05', 'epoch': '0.2514', 'num_input_tokens_seen': 20441342, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.723', 'grad_norm': '2.937', 'learning_rate': '4.988e-05', 'epoch': '0.2515', 'num_input_tokens_seen': 20443389, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.424', 'grad_norm': '2.417', 'learning_rate': '4.988e-05', 'epoch': '0.2515', 'num_input_tokens_seen': 20445436, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.666', 'grad_norm': '2.416', 'learning_rate': '4.988e-05', 'epoch': '0.2515', 'num_input_tokens_seen': 20447483, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5252', 'grad_norm': '1.267', 'learning_rate': '4.988e-05', 'epoch': '0.2515', 'num_input_tokens_seen': 20449530, 'train_runtime': '1.034e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8038', 'grad_norm': '1.342', 'learning_rate': '4.988e-05', 'epoch': '0.2516', 'num_input_tokens_seen': 20451577, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6565', 'grad_norm': '1.4', 'learning_rate': '4.988e-05', 'epoch': '0.2516', 'num_input_tokens_seen': 20453624, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5077', 'grad_norm': '0.9842', 'learning_rate': '4.988e-05', 'epoch': '0.2516', 'num_input_tokens_seen': 20455671, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4867', 'grad_norm': '1.19', 'learning_rate': '4.988e-05', 'epoch': '0.2516', 'num_input_tokens_seen': 20457718, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.564', 'grad_norm': '2.668', 'learning_rate': '4.988e-05', 'epoch': '0.2517', 'num_input_tokens_seen': 20459765, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5091', 'grad_norm': '1.242', 'learning_rate': '4.988e-05', 'epoch': '0.2517', 'num_input_tokens_seen': 20461812, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.733', 'grad_norm': '2.308', 'learning_rate': '4.988e-05', 'epoch': '0.2517', 'num_input_tokens_seen': 20463859, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.235', 'grad_norm': '1.888', 'learning_rate': '4.988e-05', 'epoch': '0.2517', 'num_input_tokens_seen': 20465906, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7533', 'grad_norm': '1.363', 'learning_rate': '4.988e-05', 'epoch': '0.2518', 'num_input_tokens_seen': 20467953, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5787', 'grad_norm': '1.229', 'learning_rate': '4.988e-05', 'epoch': '0.2518', 'num_input_tokens_seen': 20470000, 'train_runtime': '1.035e+04', 'train_tokens_per_second': '1977'} +[INFO|configuration_utils.py:665] 2026-02-05 05:30:00,331 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 05:30:00,331 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 05:30:00,853 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-10000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 05:30:00,860 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-10000/tokenizer_config.json + +{'loss': '0.9681', 'grad_norm': '1.929', 'learning_rate': '4.988e-05', 'epoch': '0.2518', 'num_input_tokens_seen': 20472047, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4677', 'grad_norm': '1.242', 'learning_rate': '4.988e-05', 'epoch': '0.2518', 'num_input_tokens_seen': 20474094, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.155', 'grad_norm': '1.662', 'learning_rate': '4.988e-05', 'epoch': '0.2519', 'num_input_tokens_seen': 20476141, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4306', 'grad_norm': '1.038', 'learning_rate': '4.988e-05', 'epoch': '0.2519', 'num_input_tokens_seen': 20478188, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4791', 'grad_norm': '1.077', 'learning_rate': '4.988e-05', 'epoch': '0.2519', 'num_input_tokens_seen': 20480235, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5922', 'grad_norm': '1.167', 'learning_rate': '4.988e-05', 'epoch': '0.2519', 'num_input_tokens_seen': 20482282, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3542', 'grad_norm': '1.07', 'learning_rate': '4.988e-05', 'epoch': '0.252', 'num_input_tokens_seen': 20484329, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8975', 'grad_norm': '1.418', 'learning_rate': '4.988e-05', 'epoch': '0.252', 'num_input_tokens_seen': 20486376, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8826', 'grad_norm': '1.516', 'learning_rate': '4.988e-05', 'epoch': '0.252', 'num_input_tokens_seen': 20488423, 'train_runtime': '1.036e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6022', 'grad_norm': '1.281', 'learning_rate': '4.988e-05', 'epoch': '0.252', 'num_input_tokens_seen': 20490470, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4508', 'grad_norm': '0.9531', 'learning_rate': '4.988e-05', 'epoch': '0.2521', 'num_input_tokens_seen': 20492517, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3507', 'grad_norm': '0.9124', 'learning_rate': '4.988e-05', 'epoch': '0.2521', 'num_input_tokens_seen': 20494564, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3567', 'grad_norm': '0.7479', 'learning_rate': '4.988e-05', 'epoch': '0.2521', 'num_input_tokens_seen': 20496611, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4071', 'grad_norm': '1.039', 'learning_rate': '4.988e-05', 'epoch': '0.2521', 'num_input_tokens_seen': 20498658, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2664', 'grad_norm': '0.8132', 'learning_rate': '4.988e-05', 'epoch': '0.2522', 'num_input_tokens_seen': 20500705, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3884', 'grad_norm': '0.9542', 'learning_rate': '4.988e-05', 'epoch': '0.2522', 'num_input_tokens_seen': 20502752, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7034', 'grad_norm': '1.103', 'learning_rate': '4.988e-05', 'epoch': '0.2522', 'num_input_tokens_seen': 20504799, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7603', 'grad_norm': '1.799', 'learning_rate': '4.988e-05', 'epoch': '0.2522', 'num_input_tokens_seen': 20506846, 'train_runtime': '1.037e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6518', 'grad_norm': '1.146', 'learning_rate': '4.988e-05', 'epoch': '0.2523', 'num_input_tokens_seen': 20508893, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.302', 'grad_norm': '2.102', 'learning_rate': '4.988e-05', 'epoch': '0.2523', 'num_input_tokens_seen': 20510940, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.374', 'grad_norm': '2.738', 'learning_rate': '4.988e-05', 'epoch': '0.2523', 'num_input_tokens_seen': 20512987, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9688', 'grad_norm': '2.01', 'learning_rate': '4.988e-05', 'epoch': '0.2523', 'num_input_tokens_seen': 20515034, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '1.306', 'learning_rate': '4.988e-05', 'epoch': '0.2524', 'num_input_tokens_seen': 20517081, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4444', 'grad_norm': '1.007', 'learning_rate': '4.988e-05', 'epoch': '0.2524', 'num_input_tokens_seen': 20519128, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4599', 'grad_norm': '1.25', 'learning_rate': '4.988e-05', 'epoch': '0.2524', 'num_input_tokens_seen': 20521175, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.198', 'grad_norm': '1.821', 'learning_rate': '4.988e-05', 'epoch': '0.2524', 'num_input_tokens_seen': 20523222, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9403', 'grad_norm': '1.887', 'learning_rate': '4.988e-05', 'epoch': '0.2525', 'num_input_tokens_seen': 20525269, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8618', 'grad_norm': '1.634', 'learning_rate': '4.988e-05', 'epoch': '0.2525', 'num_input_tokens_seen': 20527316, 'train_runtime': '1.038e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.843', 'grad_norm': '1.775', 'learning_rate': '4.988e-05', 'epoch': '0.2525', 'num_input_tokens_seen': 20529363, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2344', 'grad_norm': '0.9195', 'learning_rate': '4.988e-05', 'epoch': '0.2525', 'num_input_tokens_seen': 20531410, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2996', 'grad_norm': '0.8447', 'learning_rate': '4.988e-05', 'epoch': '0.2526', 'num_input_tokens_seen': 20533457, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3226', 'grad_norm': '1.011', 'learning_rate': '4.988e-05', 'epoch': '0.2526', 'num_input_tokens_seen': 20535504, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.305', 'grad_norm': '1.081', 'learning_rate': '4.988e-05', 'epoch': '0.2526', 'num_input_tokens_seen': 20537551, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2353', 'grad_norm': '0.8492', 'learning_rate': '4.988e-05', 'epoch': '0.2526', 'num_input_tokens_seen': 20539598, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.232', 'grad_norm': '1.143', 'learning_rate': '4.988e-05', 'epoch': '0.2527', 'num_input_tokens_seen': 20541645, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3174', 'grad_norm': '0.9031', 'learning_rate': '4.988e-05', 'epoch': '0.2527', 'num_input_tokens_seen': 20543692, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4507', 'grad_norm': '0.8811', 'learning_rate': '4.988e-05', 'epoch': '0.2527', 'num_input_tokens_seen': 20545739, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8613', 'grad_norm': '1.762', 'learning_rate': '4.988e-05', 'epoch': '0.2527', 'num_input_tokens_seen': 20547786, 'train_runtime': '1.039e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8811', 'grad_norm': '1.71', 'learning_rate': '4.988e-05', 'epoch': '0.2528', 'num_input_tokens_seen': 20549833, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.546', 'grad_norm': '2.271', 'learning_rate': '4.988e-05', 'epoch': '0.2528', 'num_input_tokens_seen': 20551880, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2619', 'grad_norm': '0.8766', 'learning_rate': '4.988e-05', 'epoch': '0.2528', 'num_input_tokens_seen': 20553927, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4999', 'grad_norm': '0.9942', 'learning_rate': '4.988e-05', 'epoch': '0.2528', 'num_input_tokens_seen': 20555974, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.844', 'grad_norm': '2.735', 'learning_rate': '4.988e-05', 'epoch': '0.2529', 'num_input_tokens_seen': 20558021, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7648', 'grad_norm': '1.642', 'learning_rate': '4.988e-05', 'epoch': '0.2529', 'num_input_tokens_seen': 20560068, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5085', 'grad_norm': '1.163', 'learning_rate': '4.988e-05', 'epoch': '0.2529', 'num_input_tokens_seen': 20562115, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2721', 'grad_norm': '0.9526', 'learning_rate': '4.988e-05', 'epoch': '0.2529', 'num_input_tokens_seen': 20564162, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.93', 'grad_norm': '2.375', 'learning_rate': '4.988e-05', 'epoch': '0.253', 'num_input_tokens_seen': 20566209, 'train_runtime': '1.04e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7362', 'grad_norm': '1.274', 'learning_rate': '4.988e-05', 'epoch': '0.253', 'num_input_tokens_seen': 20568256, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.413', 'grad_norm': '2.283', 'learning_rate': '4.988e-05', 'epoch': '0.253', 'num_input_tokens_seen': 20570303, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.981', 'grad_norm': '2.502', 'learning_rate': '4.988e-05', 'epoch': '0.253', 'num_input_tokens_seen': 20572350, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2328', 'grad_norm': '1.053', 'learning_rate': '4.988e-05', 'epoch': '0.2531', 'num_input_tokens_seen': 20574397, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.124', 'grad_norm': '2.294', 'learning_rate': '4.988e-05', 'epoch': '0.2531', 'num_input_tokens_seen': 20576444, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5589', 'grad_norm': '1.282', 'learning_rate': '4.988e-05', 'epoch': '0.2531', 'num_input_tokens_seen': 20578491, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5097', 'grad_norm': '1.328', 'learning_rate': '4.988e-05', 'epoch': '0.2531', 'num_input_tokens_seen': 20580538, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3013', 'grad_norm': '0.9658', 'learning_rate': '4.988e-05', 'epoch': '0.2532', 'num_input_tokens_seen': 20582585, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.309', 'grad_norm': '2.365', 'learning_rate': '4.988e-05', 'epoch': '0.2532', 'num_input_tokens_seen': 20584632, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.456', 'grad_norm': '2.799', 'learning_rate': '4.988e-05', 'epoch': '0.2532', 'num_input_tokens_seen': 20586679, 'train_runtime': '1.041e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.151', 'grad_norm': '1.941', 'learning_rate': '4.988e-05', 'epoch': '0.2532', 'num_input_tokens_seen': 20588726, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7256', 'grad_norm': '1.114', 'learning_rate': '4.988e-05', 'epoch': '0.2533', 'num_input_tokens_seen': 20590773, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9155', 'grad_norm': '1.997', 'learning_rate': '4.988e-05', 'epoch': '0.2533', 'num_input_tokens_seen': 20592820, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7688', 'grad_norm': '1.073', 'learning_rate': '4.988e-05', 'epoch': '0.2533', 'num_input_tokens_seen': 20594867, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3903', 'grad_norm': '1.023', 'learning_rate': '4.988e-05', 'epoch': '0.2533', 'num_input_tokens_seen': 20596914, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2536', 'grad_norm': '0.7774', 'learning_rate': '4.988e-05', 'epoch': '0.2534', 'num_input_tokens_seen': 20598961, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7389', 'grad_norm': '1.322', 'learning_rate': '4.988e-05', 'epoch': '0.2534', 'num_input_tokens_seen': 20601008, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.785', 'grad_norm': '2.811', 'learning_rate': '4.988e-05', 'epoch': '0.2534', 'num_input_tokens_seen': 20603055, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6914', 'grad_norm': '1.324', 'learning_rate': '4.988e-05', 'epoch': '0.2534', 'num_input_tokens_seen': 20605102, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.512', 'grad_norm': '3.364', 'learning_rate': '4.988e-05', 'epoch': '0.2535', 'num_input_tokens_seen': 20607149, 'train_runtime': '1.042e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7811', 'grad_norm': '1.036', 'learning_rate': '4.988e-05', 'epoch': '0.2535', 'num_input_tokens_seen': 20609196, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4387', 'grad_norm': '1.115', 'learning_rate': '4.988e-05', 'epoch': '0.2535', 'num_input_tokens_seen': 20611243, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6326', 'grad_norm': '1.242', 'learning_rate': '4.988e-05', 'epoch': '0.2535', 'num_input_tokens_seen': 20613290, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3025', 'grad_norm': '0.8363', 'learning_rate': '4.988e-05', 'epoch': '0.2536', 'num_input_tokens_seen': 20615337, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3285', 'grad_norm': '0.933', 'learning_rate': '4.988e-05', 'epoch': '0.2536', 'num_input_tokens_seen': 20617384, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6868', 'grad_norm': '1.611', 'learning_rate': '4.988e-05', 'epoch': '0.2536', 'num_input_tokens_seen': 20619431, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.535', 'grad_norm': '1.078', 'learning_rate': '4.988e-05', 'epoch': '0.2536', 'num_input_tokens_seen': 20621478, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8758', 'grad_norm': '1.848', 'learning_rate': '4.988e-05', 'epoch': '0.2537', 'num_input_tokens_seen': 20623525, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5329', 'grad_norm': '1.281', 'learning_rate': '4.988e-05', 'epoch': '0.2537', 'num_input_tokens_seen': 20625572, 'train_runtime': '1.043e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4087', 'grad_norm': '1.118', 'learning_rate': '4.988e-05', 'epoch': '0.2537', 'num_input_tokens_seen': 20627619, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.284', 'grad_norm': '2.364', 'learning_rate': '4.988e-05', 'epoch': '0.2537', 'num_input_tokens_seen': 20629666, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3413', 'grad_norm': '1.054', 'learning_rate': '4.988e-05', 'epoch': '0.2538', 'num_input_tokens_seen': 20631713, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3003', 'grad_norm': '0.8788', 'learning_rate': '4.988e-05', 'epoch': '0.2538', 'num_input_tokens_seen': 20633760, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5007', 'grad_norm': '1.248', 'learning_rate': '4.988e-05', 'epoch': '0.2538', 'num_input_tokens_seen': 20635807, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3928', 'grad_norm': '0.8581', 'learning_rate': '4.988e-05', 'epoch': '0.2538', 'num_input_tokens_seen': 20637854, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7921', 'grad_norm': '1.777', 'learning_rate': '4.988e-05', 'epoch': '0.2539', 'num_input_tokens_seen': 20639901, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.827', 'grad_norm': '2.742', 'learning_rate': '4.988e-05', 'epoch': '0.2539', 'num_input_tokens_seen': 20641948, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6922', 'grad_norm': '1.473', 'learning_rate': '4.988e-05', 'epoch': '0.2539', 'num_input_tokens_seen': 20643995, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4619', 'grad_norm': '0.9373', 'learning_rate': '4.988e-05', 'epoch': '0.2539', 'num_input_tokens_seen': 20646042, 'train_runtime': '1.044e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2727', 'grad_norm': '1.006', 'learning_rate': '4.988e-05', 'epoch': '0.254', 'num_input_tokens_seen': 20648089, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.477', 'grad_norm': '1.19', 'learning_rate': '4.988e-05', 'epoch': '0.254', 'num_input_tokens_seen': 20650136, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2915', 'grad_norm': '0.864', 'learning_rate': '4.988e-05', 'epoch': '0.254', 'num_input_tokens_seen': 20652183, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.496', 'grad_norm': '2.293', 'learning_rate': '4.988e-05', 'epoch': '0.254', 'num_input_tokens_seen': 20654230, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.253', 'grad_norm': '2.48', 'learning_rate': '4.988e-05', 'epoch': '0.2541', 'num_input_tokens_seen': 20656277, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3867', 'grad_norm': '0.899', 'learning_rate': '4.988e-05', 'epoch': '0.2541', 'num_input_tokens_seen': 20658324, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5212', 'grad_norm': '1.093', 'learning_rate': '4.988e-05', 'epoch': '0.2541', 'num_input_tokens_seen': 20660371, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4176', 'grad_norm': '0.8944', 'learning_rate': '4.988e-05', 'epoch': '0.2541', 'num_input_tokens_seen': 20662418, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6706', 'grad_norm': '1.488', 'learning_rate': '4.988e-05', 'epoch': '0.2542', 'num_input_tokens_seen': 20664465, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.316', 'grad_norm': '0.8427', 'learning_rate': '4.988e-05', 'epoch': '0.2542', 'num_input_tokens_seen': 20666512, 'train_runtime': '1.045e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.754', 'grad_norm': '2.813', 'learning_rate': '4.988e-05', 'epoch': '0.2542', 'num_input_tokens_seen': 20668559, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3412', 'grad_norm': '1.062', 'learning_rate': '4.988e-05', 'epoch': '0.2542', 'num_input_tokens_seen': 20670606, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9617', 'grad_norm': '1.83', 'learning_rate': '4.988e-05', 'epoch': '0.2543', 'num_input_tokens_seen': 20672653, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.35', 'grad_norm': '2.57', 'learning_rate': '4.988e-05', 'epoch': '0.2543', 'num_input_tokens_seen': 20674700, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.185', 'grad_norm': '1.399', 'learning_rate': '4.988e-05', 'epoch': '0.2543', 'num_input_tokens_seen': 20676747, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5927', 'grad_norm': '1.609', 'learning_rate': '4.988e-05', 'epoch': '0.2543', 'num_input_tokens_seen': 20678794, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4079', 'grad_norm': '1.069', 'learning_rate': '4.988e-05', 'epoch': '0.2544', 'num_input_tokens_seen': 20680841, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3558', 'grad_norm': '0.9561', 'learning_rate': '4.988e-05', 'epoch': '0.2544', 'num_input_tokens_seen': 20682888, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5235', 'grad_norm': '1.108', 'learning_rate': '4.988e-05', 'epoch': '0.2544', 'num_input_tokens_seen': 20684935, 'train_runtime': '1.046e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6778', 'grad_norm': '1.435', 'learning_rate': '4.988e-05', 'epoch': '0.2545', 'num_input_tokens_seen': 20686982, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.202', 'grad_norm': '1.909', 'learning_rate': '4.988e-05', 'epoch': '0.2545', 'num_input_tokens_seen': 20689029, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7815', 'grad_norm': '1.638', 'learning_rate': '4.988e-05', 'epoch': '0.2545', 'num_input_tokens_seen': 20691076, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.293', 'grad_norm': '0.6988', 'learning_rate': '4.988e-05', 'epoch': '0.2545', 'num_input_tokens_seen': 20693123, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8799', 'grad_norm': '1.331', 'learning_rate': '4.988e-05', 'epoch': '0.2546', 'num_input_tokens_seen': 20695170, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2144', 'grad_norm': '0.784', 'learning_rate': '4.988e-05', 'epoch': '0.2546', 'num_input_tokens_seen': 20697217, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6876', 'grad_norm': '1.37', 'learning_rate': '4.988e-05', 'epoch': '0.2546', 'num_input_tokens_seen': 20699264, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5393', 'grad_norm': '1.063', 'learning_rate': '4.988e-05', 'epoch': '0.2546', 'num_input_tokens_seen': 20701311, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.323', 'grad_norm': '2.039', 'learning_rate': '4.988e-05', 'epoch': '0.2547', 'num_input_tokens_seen': 20703358, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.662', 'grad_norm': '1.396', 'learning_rate': '4.988e-05', 'epoch': '0.2547', 'num_input_tokens_seen': 20705405, 'train_runtime': '1.047e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8716', 'grad_norm': '1.217', 'learning_rate': '4.988e-05', 'epoch': '0.2547', 'num_input_tokens_seen': 20707452, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3816', 'grad_norm': '0.9115', 'learning_rate': '4.988e-05', 'epoch': '0.2547', 'num_input_tokens_seen': 20709499, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3123', 'grad_norm': '0.9511', 'learning_rate': '4.988e-05', 'epoch': '0.2548', 'num_input_tokens_seen': 20711546, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.414', 'grad_norm': '1.068', 'learning_rate': '4.988e-05', 'epoch': '0.2548', 'num_input_tokens_seen': 20713593, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2564', 'grad_norm': '0.855', 'learning_rate': '4.988e-05', 'epoch': '0.2548', 'num_input_tokens_seen': 20715640, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4171', 'grad_norm': '1.05', 'learning_rate': '4.988e-05', 'epoch': '0.2548', 'num_input_tokens_seen': 20717687, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.053', 'grad_norm': '1.639', 'learning_rate': '4.988e-05', 'epoch': '0.2549', 'num_input_tokens_seen': 20719734, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3675', 'grad_norm': '0.7423', 'learning_rate': '4.988e-05', 'epoch': '0.2549', 'num_input_tokens_seen': 20721781, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3894', 'grad_norm': '0.8426', 'learning_rate': '4.988e-05', 'epoch': '0.2549', 'num_input_tokens_seen': 20723828, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.337', 'grad_norm': '2.598', 'learning_rate': '4.988e-05', 'epoch': '0.2549', 'num_input_tokens_seen': 20725875, 'train_runtime': '1.048e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7573', 'grad_norm': '1.962', 'learning_rate': '4.988e-05', 'epoch': '0.255', 'num_input_tokens_seen': 20727922, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4177', 'grad_norm': '0.9563', 'learning_rate': '4.988e-05', 'epoch': '0.255', 'num_input_tokens_seen': 20729969, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7123', 'grad_norm': '1.173', 'learning_rate': '4.988e-05', 'epoch': '0.255', 'num_input_tokens_seen': 20732016, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.006', 'grad_norm': '1.605', 'learning_rate': '4.988e-05', 'epoch': '0.255', 'num_input_tokens_seen': 20734063, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8236', 'grad_norm': '1.441', 'learning_rate': '4.988e-05', 'epoch': '0.2551', 'num_input_tokens_seen': 20736110, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.575', 'grad_norm': '2.401', 'learning_rate': '4.988e-05', 'epoch': '0.2551', 'num_input_tokens_seen': 20738157, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9463', 'grad_norm': '1.482', 'learning_rate': '4.988e-05', 'epoch': '0.2551', 'num_input_tokens_seen': 20740204, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5609', 'grad_norm': '1.12', 'learning_rate': '4.988e-05', 'epoch': '0.2551', 'num_input_tokens_seen': 20742251, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4728', 'grad_norm': '1.328', 'learning_rate': '4.988e-05', 'epoch': '0.2552', 'num_input_tokens_seen': 20744298, 'train_runtime': '1.049e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3668', 'grad_norm': '0.8778', 'learning_rate': '4.988e-05', 'epoch': '0.2552', 'num_input_tokens_seen': 20746345, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.524', 'grad_norm': '2.316', 'learning_rate': '4.988e-05', 'epoch': '0.2552', 'num_input_tokens_seen': 20748392, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6891', 'grad_norm': '1.099', 'learning_rate': '4.988e-05', 'epoch': '0.2552', 'num_input_tokens_seen': 20750439, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.114', 'grad_norm': '1.61', 'learning_rate': '4.988e-05', 'epoch': '0.2553', 'num_input_tokens_seen': 20752486, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8006', 'grad_norm': '1.532', 'learning_rate': '4.988e-05', 'epoch': '0.2553', 'num_input_tokens_seen': 20754533, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4958', 'grad_norm': '1.32', 'learning_rate': '4.988e-05', 'epoch': '0.2553', 'num_input_tokens_seen': 20756580, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2748', 'grad_norm': '1.03', 'learning_rate': '4.988e-05', 'epoch': '0.2553', 'num_input_tokens_seen': 20758627, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4209', 'grad_norm': '1.06', 'learning_rate': '4.988e-05', 'epoch': '0.2554', 'num_input_tokens_seen': 20760674, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9902', 'grad_norm': '1.187', 'learning_rate': '4.988e-05', 'epoch': '0.2554', 'num_input_tokens_seen': 20762721, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.09', 'grad_norm': '2.184', 'learning_rate': '4.988e-05', 'epoch': '0.2554', 'num_input_tokens_seen': 20764768, 'train_runtime': '1.05e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.242', 'grad_norm': '2.632', 'learning_rate': '4.988e-05', 'epoch': '0.2554', 'num_input_tokens_seen': 20766815, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3869', 'grad_norm': '0.8405', 'learning_rate': '4.988e-05', 'epoch': '0.2555', 'num_input_tokens_seen': 20768862, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6585', 'grad_norm': '1.388', 'learning_rate': '4.988e-05', 'epoch': '0.2555', 'num_input_tokens_seen': 20770909, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5855', 'grad_norm': '1.351', 'learning_rate': '4.988e-05', 'epoch': '0.2555', 'num_input_tokens_seen': 20772956, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8097', 'grad_norm': '1.443', 'learning_rate': '4.988e-05', 'epoch': '0.2555', 'num_input_tokens_seen': 20775003, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.516', 'grad_norm': '2.117', 'learning_rate': '4.988e-05', 'epoch': '0.2556', 'num_input_tokens_seen': 20777050, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7423', 'grad_norm': '1.373', 'learning_rate': '4.988e-05', 'epoch': '0.2556', 'num_input_tokens_seen': 20779097, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7225', 'grad_norm': '1.092', 'learning_rate': '4.988e-05', 'epoch': '0.2556', 'num_input_tokens_seen': 20781144, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '1', 'grad_norm': '2.059', 'learning_rate': '4.988e-05', 'epoch': '0.2556', 'num_input_tokens_seen': 20783191, 'train_runtime': '1.051e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.653', 'grad_norm': '1.503', 'learning_rate': '4.988e-05', 'epoch': '0.2557', 'num_input_tokens_seen': 20785238, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8009', 'grad_norm': '1.156', 'learning_rate': '4.988e-05', 'epoch': '0.2557', 'num_input_tokens_seen': 20787285, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6381', 'grad_norm': '1.511', 'learning_rate': '4.988e-05', 'epoch': '0.2557', 'num_input_tokens_seen': 20789332, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.647', 'grad_norm': '1.273', 'learning_rate': '4.988e-05', 'epoch': '0.2557', 'num_input_tokens_seen': 20791379, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6471', 'grad_norm': '1.217', 'learning_rate': '4.988e-05', 'epoch': '0.2558', 'num_input_tokens_seen': 20793426, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5481', 'grad_norm': '1.255', 'learning_rate': '4.988e-05', 'epoch': '0.2558', 'num_input_tokens_seen': 20795473, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3168', 'grad_norm': '1.145', 'learning_rate': '4.988e-05', 'epoch': '0.2558', 'num_input_tokens_seen': 20797520, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7445', 'grad_norm': '1.229', 'learning_rate': '4.988e-05', 'epoch': '0.2558', 'num_input_tokens_seen': 20799567, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9879', 'grad_norm': '1.487', 'learning_rate': '4.988e-05', 'epoch': '0.2559', 'num_input_tokens_seen': 20801614, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.778', 'grad_norm': '2.571', 'learning_rate': '4.988e-05', 'epoch': '0.2559', 'num_input_tokens_seen': 20803661, 'train_runtime': '1.052e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3336', 'grad_norm': '1.023', 'learning_rate': '4.988e-05', 'epoch': '0.2559', 'num_input_tokens_seen': 20805708, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4968', 'grad_norm': '1.036', 'learning_rate': '4.988e-05', 'epoch': '0.2559', 'num_input_tokens_seen': 20807755, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7963', 'grad_norm': '1.618', 'learning_rate': '4.988e-05', 'epoch': '0.256', 'num_input_tokens_seen': 20809802, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7307', 'grad_norm': '1.301', 'learning_rate': '4.988e-05', 'epoch': '0.256', 'num_input_tokens_seen': 20811849, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6342', 'grad_norm': '1.431', 'learning_rate': '4.988e-05', 'epoch': '0.256', 'num_input_tokens_seen': 20813896, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.573', 'grad_norm': '1.291', 'learning_rate': '4.988e-05', 'epoch': '0.256', 'num_input_tokens_seen': 20815943, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4821', 'grad_norm': '0.9326', 'learning_rate': '4.988e-05', 'epoch': '0.2561', 'num_input_tokens_seen': 20817990, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9698', 'grad_norm': '1.299', 'learning_rate': '4.987e-05', 'epoch': '0.2561', 'num_input_tokens_seen': 20820037, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6324', 'grad_norm': '1.112', 'learning_rate': '4.987e-05', 'epoch': '0.2561', 'num_input_tokens_seen': 20822084, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7731', 'grad_norm': '1.339', 'learning_rate': '4.987e-05', 'epoch': '0.2561', 'num_input_tokens_seen': 20824131, 'train_runtime': '1.053e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.146', 'grad_norm': '1.864', 'learning_rate': '4.987e-05', 'epoch': '0.2562', 'num_input_tokens_seen': 20826178, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9935', 'grad_norm': '1.314', 'learning_rate': '4.987e-05', 'epoch': '0.2562', 'num_input_tokens_seen': 20828225, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.392', 'grad_norm': '1.935', 'learning_rate': '4.987e-05', 'epoch': '0.2562', 'num_input_tokens_seen': 20830272, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4953', 'grad_norm': '1.194', 'learning_rate': '4.987e-05', 'epoch': '0.2562', 'num_input_tokens_seen': 20832319, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2857', 'grad_norm': '0.6868', 'learning_rate': '4.987e-05', 'epoch': '0.2563', 'num_input_tokens_seen': 20834366, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9442', 'grad_norm': '1.443', 'learning_rate': '4.987e-05', 'epoch': '0.2563', 'num_input_tokens_seen': 20836413, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7229', 'grad_norm': '1.281', 'learning_rate': '4.987e-05', 'epoch': '0.2563', 'num_input_tokens_seen': 20838460, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2876', 'grad_norm': '0.9094', 'learning_rate': '4.987e-05', 'epoch': '0.2563', 'num_input_tokens_seen': 20840507, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2983', 'grad_norm': '0.9724', 'learning_rate': '4.987e-05', 'epoch': '0.2564', 'num_input_tokens_seen': 20842554, 'train_runtime': '1.054e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5589', 'grad_norm': '1.4', 'learning_rate': '4.987e-05', 'epoch': '0.2564', 'num_input_tokens_seen': 20844601, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3553', 'grad_norm': '1.039', 'learning_rate': '4.987e-05', 'epoch': '0.2564', 'num_input_tokens_seen': 20846648, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.007', 'grad_norm': '1.725', 'learning_rate': '4.987e-05', 'epoch': '0.2564', 'num_input_tokens_seen': 20848695, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3218', 'grad_norm': '0.8748', 'learning_rate': '4.987e-05', 'epoch': '0.2565', 'num_input_tokens_seen': 20850742, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.116', 'grad_norm': '2.099', 'learning_rate': '4.987e-05', 'epoch': '0.2565', 'num_input_tokens_seen': 20852789, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6201', 'grad_norm': '1.598', 'learning_rate': '4.987e-05', 'epoch': '0.2565', 'num_input_tokens_seen': 20854836, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5134', 'grad_norm': '1.385', 'learning_rate': '4.987e-05', 'epoch': '0.2565', 'num_input_tokens_seen': 20856883, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.84', 'grad_norm': '3.189', 'learning_rate': '4.987e-05', 'epoch': '0.2566', 'num_input_tokens_seen': 20858930, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2316', 'grad_norm': '0.8832', 'learning_rate': '4.987e-05', 'epoch': '0.2566', 'num_input_tokens_seen': 20860977, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.128', 'grad_norm': '1.421', 'learning_rate': '4.987e-05', 'epoch': '0.2566', 'num_input_tokens_seen': 20863024, 'train_runtime': '1.055e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8271', 'grad_norm': '1.199', 'learning_rate': '4.987e-05', 'epoch': '0.2566', 'num_input_tokens_seen': 20865071, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.967', 'grad_norm': '2.98', 'learning_rate': '4.987e-05', 'epoch': '0.2567', 'num_input_tokens_seen': 20867118, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.88', 'grad_norm': '1.283', 'learning_rate': '4.987e-05', 'epoch': '0.2567', 'num_input_tokens_seen': 20869165, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6299', 'grad_norm': '0.8538', 'learning_rate': '4.987e-05', 'epoch': '0.2567', 'num_input_tokens_seen': 20871212, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3418', 'grad_norm': '0.7827', 'learning_rate': '4.987e-05', 'epoch': '0.2567', 'num_input_tokens_seen': 20873259, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9448', 'grad_norm': '1.352', 'learning_rate': '4.987e-05', 'epoch': '0.2568', 'num_input_tokens_seen': 20875306, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9307', 'grad_norm': '1.508', 'learning_rate': '4.987e-05', 'epoch': '0.2568', 'num_input_tokens_seen': 20877353, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2835', 'grad_norm': '0.776', 'learning_rate': '4.987e-05', 'epoch': '0.2568', 'num_input_tokens_seen': 20879400, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5081', 'grad_norm': '1.026', 'learning_rate': '4.987e-05', 'epoch': '0.2568', 'num_input_tokens_seen': 20881447, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2818', 'grad_norm': '0.8929', 'learning_rate': '4.987e-05', 'epoch': '0.2569', 'num_input_tokens_seen': 20883494, 'train_runtime': '1.056e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4136', 'grad_norm': '0.928', 'learning_rate': '4.987e-05', 'epoch': '0.2569', 'num_input_tokens_seen': 20885541, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9431', 'grad_norm': '1.671', 'learning_rate': '4.987e-05', 'epoch': '0.2569', 'num_input_tokens_seen': 20887588, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3376', 'grad_norm': '0.8828', 'learning_rate': '4.987e-05', 'epoch': '0.2569', 'num_input_tokens_seen': 20889635, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.916', 'grad_norm': '2.924', 'learning_rate': '4.987e-05', 'epoch': '0.257', 'num_input_tokens_seen': 20891682, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.475', 'grad_norm': '2.125', 'learning_rate': '4.987e-05', 'epoch': '0.257', 'num_input_tokens_seen': 20893729, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2653', 'grad_norm': '0.7933', 'learning_rate': '4.987e-05', 'epoch': '0.257', 'num_input_tokens_seen': 20895776, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3363', 'grad_norm': '1.113', 'learning_rate': '4.987e-05', 'epoch': '0.257', 'num_input_tokens_seen': 20897823, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9236', 'grad_norm': '1.424', 'learning_rate': '4.987e-05', 'epoch': '0.2571', 'num_input_tokens_seen': 20899870, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4179', 'grad_norm': '0.9664', 'learning_rate': '4.987e-05', 'epoch': '0.2571', 'num_input_tokens_seen': 20901917, 'train_runtime': '1.057e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2623', 'grad_norm': '0.8802', 'learning_rate': '4.987e-05', 'epoch': '0.2571', 'num_input_tokens_seen': 20903964, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3697', 'grad_norm': '0.9839', 'learning_rate': '4.987e-05', 'epoch': '0.2571', 'num_input_tokens_seen': 20906011, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3846', 'grad_norm': '1.005', 'learning_rate': '4.987e-05', 'epoch': '0.2572', 'num_input_tokens_seen': 20908058, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.42', 'grad_norm': '2.378', 'learning_rate': '4.987e-05', 'epoch': '0.2572', 'num_input_tokens_seen': 20910105, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4288', 'grad_norm': '0.8801', 'learning_rate': '4.987e-05', 'epoch': '0.2572', 'num_input_tokens_seen': 20912152, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4794', 'grad_norm': '0.9817', 'learning_rate': '4.987e-05', 'epoch': '0.2572', 'num_input_tokens_seen': 20914199, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1814', 'grad_norm': '0.7555', 'learning_rate': '4.987e-05', 'epoch': '0.2573', 'num_input_tokens_seen': 20916246, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.423', 'grad_norm': '2.755', 'learning_rate': '4.987e-05', 'epoch': '0.2573', 'num_input_tokens_seen': 20918293, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3423', 'grad_norm': '1.062', 'learning_rate': '4.987e-05', 'epoch': '0.2573', 'num_input_tokens_seen': 20920340, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7581', 'grad_norm': '1.031', 'learning_rate': '4.987e-05', 'epoch': '0.2573', 'num_input_tokens_seen': 20922387, 'train_runtime': '1.058e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4651', 'grad_norm': '1.311', 'learning_rate': '4.987e-05', 'epoch': '0.2574', 'num_input_tokens_seen': 20924434, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.528', 'grad_norm': '1.196', 'learning_rate': '4.987e-05', 'epoch': '0.2574', 'num_input_tokens_seen': 20926481, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8485', 'grad_norm': '1.729', 'learning_rate': '4.987e-05', 'epoch': '0.2574', 'num_input_tokens_seen': 20928528, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.882', 'grad_norm': '1.394', 'learning_rate': '4.987e-05', 'epoch': '0.2574', 'num_input_tokens_seen': 20930575, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7256', 'grad_norm': '1.065', 'learning_rate': '4.987e-05', 'epoch': '0.2575', 'num_input_tokens_seen': 20932622, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '1.605', 'learning_rate': '4.987e-05', 'epoch': '0.2575', 'num_input_tokens_seen': 20934669, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.905', 'grad_norm': '2.592', 'learning_rate': '4.987e-05', 'epoch': '0.2575', 'num_input_tokens_seen': 20936716, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8066', 'grad_norm': '1.386', 'learning_rate': '4.987e-05', 'epoch': '0.2575', 'num_input_tokens_seen': 20938763, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5636', 'grad_norm': '0.9922', 'learning_rate': '4.987e-05', 'epoch': '0.2576', 'num_input_tokens_seen': 20940810, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2082', 'grad_norm': '0.8619', 'learning_rate': '4.987e-05', 'epoch': '0.2576', 'num_input_tokens_seen': 20942857, 'train_runtime': '1.059e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4968', 'grad_norm': '1.15', 'learning_rate': '4.987e-05', 'epoch': '0.2576', 'num_input_tokens_seen': 20944904, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3618', 'grad_norm': '0.8059', 'learning_rate': '4.987e-05', 'epoch': '0.2576', 'num_input_tokens_seen': 20946951, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.899', 'grad_norm': '1.798', 'learning_rate': '4.987e-05', 'epoch': '0.2577', 'num_input_tokens_seen': 20948998, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5873', 'grad_norm': '0.9579', 'learning_rate': '4.987e-05', 'epoch': '0.2577', 'num_input_tokens_seen': 20951045, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5643', 'grad_norm': '1.204', 'learning_rate': '4.987e-05', 'epoch': '0.2577', 'num_input_tokens_seen': 20953092, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5119', 'grad_norm': '1.065', 'learning_rate': '4.987e-05', 'epoch': '0.2577', 'num_input_tokens_seen': 20955139, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4511', 'grad_norm': '1.312', 'learning_rate': '4.987e-05', 'epoch': '0.2578', 'num_input_tokens_seen': 20957186, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5103', 'grad_norm': '1.074', 'learning_rate': '4.987e-05', 'epoch': '0.2578', 'num_input_tokens_seen': 20959233, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6159', 'grad_norm': '0.8611', 'learning_rate': '4.987e-05', 'epoch': '0.2578', 'num_input_tokens_seen': 20961280, 'train_runtime': '1.06e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4604', 'grad_norm': '0.9359', 'learning_rate': '4.987e-05', 'epoch': '0.2578', 'num_input_tokens_seen': 20963327, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3629', 'grad_norm': '0.9491', 'learning_rate': '4.987e-05', 'epoch': '0.2579', 'num_input_tokens_seen': 20965374, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2915', 'grad_norm': '0.8664', 'learning_rate': '4.987e-05', 'epoch': '0.2579', 'num_input_tokens_seen': 20967421, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9145', 'grad_norm': '1.367', 'learning_rate': '4.987e-05', 'epoch': '0.2579', 'num_input_tokens_seen': 20969468, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3715', 'grad_norm': '1.001', 'learning_rate': '4.987e-05', 'epoch': '0.2579', 'num_input_tokens_seen': 20971515, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.386', 'grad_norm': '0.8608', 'learning_rate': '4.987e-05', 'epoch': '0.258', 'num_input_tokens_seen': 20973562, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4801', 'grad_norm': '1.072', 'learning_rate': '4.987e-05', 'epoch': '0.258', 'num_input_tokens_seen': 20975609, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.917', 'grad_norm': '2.108', 'learning_rate': '4.987e-05', 'epoch': '0.258', 'num_input_tokens_seen': 20977656, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6474', 'grad_norm': '1.58', 'learning_rate': '4.987e-05', 'epoch': '0.2581', 'num_input_tokens_seen': 20979703, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.422', 'learning_rate': '4.987e-05', 'epoch': '0.2581', 'num_input_tokens_seen': 20981750, 'train_runtime': '1.061e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.014', 'grad_norm': '1.1', 'learning_rate': '4.987e-05', 'epoch': '0.2581', 'num_input_tokens_seen': 20983797, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.641', 'grad_norm': '2.483', 'learning_rate': '4.987e-05', 'epoch': '0.2581', 'num_input_tokens_seen': 20985844, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6218', 'grad_norm': '1.509', 'learning_rate': '4.987e-05', 'epoch': '0.2582', 'num_input_tokens_seen': 20987891, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8462', 'grad_norm': '1.468', 'learning_rate': '4.987e-05', 'epoch': '0.2582', 'num_input_tokens_seen': 20989938, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5612', 'grad_norm': '1.197', 'learning_rate': '4.987e-05', 'epoch': '0.2582', 'num_input_tokens_seen': 20991985, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.371', 'grad_norm': '2.882', 'learning_rate': '4.987e-05', 'epoch': '0.2582', 'num_input_tokens_seen': 20994032, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8886', 'grad_norm': '1.368', 'learning_rate': '4.987e-05', 'epoch': '0.2583', 'num_input_tokens_seen': 20996079, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9781', 'grad_norm': '1.569', 'learning_rate': '4.987e-05', 'epoch': '0.2583', 'num_input_tokens_seen': 20998126, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2165', 'grad_norm': '1.017', 'learning_rate': '4.987e-05', 'epoch': '0.2583', 'num_input_tokens_seen': 21000173, 'train_runtime': '1.062e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.73', 'grad_norm': '1.474', 'learning_rate': '4.987e-05', 'epoch': '0.2583', 'num_input_tokens_seen': 21002220, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8268', 'grad_norm': '1.198', 'learning_rate': '4.987e-05', 'epoch': '0.2584', 'num_input_tokens_seen': 21004267, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2107', 'grad_norm': '0.8794', 'learning_rate': '4.987e-05', 'epoch': '0.2584', 'num_input_tokens_seen': 21006314, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9267', 'grad_norm': '1.622', 'learning_rate': '4.987e-05', 'epoch': '0.2584', 'num_input_tokens_seen': 21008361, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8367', 'grad_norm': '1.235', 'learning_rate': '4.987e-05', 'epoch': '0.2584', 'num_input_tokens_seen': 21010408, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9727', 'grad_norm': '1.567', 'learning_rate': '4.987e-05', 'epoch': '0.2585', 'num_input_tokens_seen': 21012455, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.251', 'grad_norm': '2.064', 'learning_rate': '4.987e-05', 'epoch': '0.2585', 'num_input_tokens_seen': 21014502, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.445', 'grad_norm': '1.287', 'learning_rate': '4.987e-05', 'epoch': '0.2585', 'num_input_tokens_seen': 21016549, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.532', 'grad_norm': '1.136', 'learning_rate': '4.987e-05', 'epoch': '0.2585', 'num_input_tokens_seen': 21018596, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.073', 'grad_norm': '1.298', 'learning_rate': '4.987e-05', 'epoch': '0.2586', 'num_input_tokens_seen': 21020643, 'train_runtime': '1.063e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.278', 'grad_norm': '2.107', 'learning_rate': '4.987e-05', 'epoch': '0.2586', 'num_input_tokens_seen': 21022690, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.018', 'grad_norm': '1.705', 'learning_rate': '4.987e-05', 'epoch': '0.2586', 'num_input_tokens_seen': 21024737, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6448', 'grad_norm': '1.244', 'learning_rate': '4.987e-05', 'epoch': '0.2586', 'num_input_tokens_seen': 21026784, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7045', 'grad_norm': '1.702', 'learning_rate': '4.987e-05', 'epoch': '0.2587', 'num_input_tokens_seen': 21028831, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2957', 'grad_norm': '0.9633', 'learning_rate': '4.987e-05', 'epoch': '0.2587', 'num_input_tokens_seen': 21030878, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8223', 'grad_norm': '1.351', 'learning_rate': '4.987e-05', 'epoch': '0.2587', 'num_input_tokens_seen': 21032925, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7542', 'grad_norm': '1.344', 'learning_rate': '4.987e-05', 'epoch': '0.2587', 'num_input_tokens_seen': 21034972, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6194', 'grad_norm': '1.215', 'learning_rate': '4.987e-05', 'epoch': '0.2588', 'num_input_tokens_seen': 21037019, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5468', 'grad_norm': '1.221', 'learning_rate': '4.987e-05', 'epoch': '0.2588', 'num_input_tokens_seen': 21039066, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.702', 'grad_norm': '2.096', 'learning_rate': '4.987e-05', 'epoch': '0.2588', 'num_input_tokens_seen': 21041113, 'train_runtime': '1.064e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9925', 'grad_norm': '1.464', 'learning_rate': '4.987e-05', 'epoch': '0.2588', 'num_input_tokens_seen': 21043160, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.948', 'grad_norm': '1.512', 'learning_rate': '4.987e-05', 'epoch': '0.2589', 'num_input_tokens_seen': 21045207, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6318', 'grad_norm': '0.9832', 'learning_rate': '4.987e-05', 'epoch': '0.2589', 'num_input_tokens_seen': 21047254, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.076', 'grad_norm': '1.26', 'learning_rate': '4.987e-05', 'epoch': '0.2589', 'num_input_tokens_seen': 21049301, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2445', 'grad_norm': '0.9547', 'learning_rate': '4.987e-05', 'epoch': '0.2589', 'num_input_tokens_seen': 21051348, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7207', 'grad_norm': '1.39', 'learning_rate': '4.987e-05', 'epoch': '0.259', 'num_input_tokens_seen': 21053395, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.663', 'grad_norm': '2.247', 'learning_rate': '4.987e-05', 'epoch': '0.259', 'num_input_tokens_seen': 21055442, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3114', 'grad_norm': '1.042', 'learning_rate': '4.987e-05', 'epoch': '0.259', 'num_input_tokens_seen': 21057489, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7152', 'grad_norm': '1.124', 'learning_rate': '4.987e-05', 'epoch': '0.259', 'num_input_tokens_seen': 21059536, 'train_runtime': '1.065e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7659', 'grad_norm': '1.861', 'learning_rate': '4.987e-05', 'epoch': '0.2591', 'num_input_tokens_seen': 21061583, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3522', 'grad_norm': '0.8517', 'learning_rate': '4.987e-05', 'epoch': '0.2591', 'num_input_tokens_seen': 21063630, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.238', 'grad_norm': '2.009', 'learning_rate': '4.987e-05', 'epoch': '0.2591', 'num_input_tokens_seen': 21065677, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8869', 'grad_norm': '1.417', 'learning_rate': '4.987e-05', 'epoch': '0.2591', 'num_input_tokens_seen': 21067724, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6234', 'grad_norm': '1.005', 'learning_rate': '4.987e-05', 'epoch': '0.2592', 'num_input_tokens_seen': 21069771, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9768', 'grad_norm': '1.994', 'learning_rate': '4.987e-05', 'epoch': '0.2592', 'num_input_tokens_seen': 21071818, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2641', 'grad_norm': '0.8413', 'learning_rate': '4.987e-05', 'epoch': '0.2592', 'num_input_tokens_seen': 21073865, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5071', 'grad_norm': '1.031', 'learning_rate': '4.987e-05', 'epoch': '0.2592', 'num_input_tokens_seen': 21075912, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.025', 'grad_norm': '1.538', 'learning_rate': '4.987e-05', 'epoch': '0.2593', 'num_input_tokens_seen': 21077959, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8752', 'grad_norm': '1.436', 'learning_rate': '4.987e-05', 'epoch': '0.2593', 'num_input_tokens_seen': 21080006, 'train_runtime': '1.066e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3159', 'grad_norm': '0.9409', 'learning_rate': '4.987e-05', 'epoch': '0.2593', 'num_input_tokens_seen': 21082053, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8249', 'grad_norm': '1.778', 'learning_rate': '4.987e-05', 'epoch': '0.2593', 'num_input_tokens_seen': 21084100, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3104', 'grad_norm': '0.9322', 'learning_rate': '4.987e-05', 'epoch': '0.2594', 'num_input_tokens_seen': 21086147, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.266', 'grad_norm': '1.99', 'learning_rate': '4.987e-05', 'epoch': '0.2594', 'num_input_tokens_seen': 21088194, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2322', 'grad_norm': '0.7375', 'learning_rate': '4.987e-05', 'epoch': '0.2594', 'num_input_tokens_seen': 21090241, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5784', 'grad_norm': '1.478', 'learning_rate': '4.987e-05', 'epoch': '0.2594', 'num_input_tokens_seen': 21092288, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5876', 'grad_norm': '1.304', 'learning_rate': '4.987e-05', 'epoch': '0.2595', 'num_input_tokens_seen': 21094335, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3139', 'grad_norm': '0.9909', 'learning_rate': '4.987e-05', 'epoch': '0.2595', 'num_input_tokens_seen': 21096382, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8814', 'grad_norm': '1.574', 'learning_rate': '4.987e-05', 'epoch': '0.2595', 'num_input_tokens_seen': 21098429, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3492', 'grad_norm': '1.075', 'learning_rate': '4.987e-05', 'epoch': '0.2595', 'num_input_tokens_seen': 21100476, 'train_runtime': '1.067e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.591', 'grad_norm': '2.74', 'learning_rate': '4.987e-05', 'epoch': '0.2596', 'num_input_tokens_seen': 21102523, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8352', 'grad_norm': '1.212', 'learning_rate': '4.987e-05', 'epoch': '0.2596', 'num_input_tokens_seen': 21104570, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4259', 'grad_norm': '0.9133', 'learning_rate': '4.987e-05', 'epoch': '0.2596', 'num_input_tokens_seen': 21106617, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8649', 'grad_norm': '1.587', 'learning_rate': '4.987e-05', 'epoch': '0.2596', 'num_input_tokens_seen': 21108664, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9363', 'grad_norm': '1.945', 'learning_rate': '4.987e-05', 'epoch': '0.2597', 'num_input_tokens_seen': 21110711, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5629', 'grad_norm': '1.308', 'learning_rate': '4.987e-05', 'epoch': '0.2597', 'num_input_tokens_seen': 21112758, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.325', 'grad_norm': '2.259', 'learning_rate': '4.987e-05', 'epoch': '0.2597', 'num_input_tokens_seen': 21114805, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2727', 'grad_norm': '0.9123', 'learning_rate': '4.987e-05', 'epoch': '0.2597', 'num_input_tokens_seen': 21116852, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3186', 'grad_norm': '0.9784', 'learning_rate': '4.987e-05', 'epoch': '0.2598', 'num_input_tokens_seen': 21118899, 'train_runtime': '1.068e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9541', 'grad_norm': '1.471', 'learning_rate': '4.987e-05', 'epoch': '0.2598', 'num_input_tokens_seen': 21120946, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6449', 'grad_norm': '1.014', 'learning_rate': '4.987e-05', 'epoch': '0.2598', 'num_input_tokens_seen': 21122993, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.454', 'grad_norm': '2.462', 'learning_rate': '4.987e-05', 'epoch': '0.2598', 'num_input_tokens_seen': 21125040, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2509', 'grad_norm': '0.8587', 'learning_rate': '4.987e-05', 'epoch': '0.2599', 'num_input_tokens_seen': 21127087, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4803', 'grad_norm': '1.466', 'learning_rate': '4.987e-05', 'epoch': '0.2599', 'num_input_tokens_seen': 21129134, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3468', 'grad_norm': '1.026', 'learning_rate': '4.987e-05', 'epoch': '0.2599', 'num_input_tokens_seen': 21131181, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.881', 'learning_rate': '4.987e-05', 'epoch': '0.2599', 'num_input_tokens_seen': 21133228, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.209', 'learning_rate': '4.987e-05', 'epoch': '0.26', 'num_input_tokens_seen': 21135275, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.594', 'grad_norm': '2.659', 'learning_rate': '4.987e-05', 'epoch': '0.26', 'num_input_tokens_seen': 21137322, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5958', 'grad_norm': '1.499', 'learning_rate': '4.987e-05', 'epoch': '0.26', 'num_input_tokens_seen': 21139369, 'train_runtime': '1.069e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8092', 'grad_norm': '1.618', 'learning_rate': '4.987e-05', 'epoch': '0.26', 'num_input_tokens_seen': 21141416, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3673', 'grad_norm': '0.9607', 'learning_rate': '4.987e-05', 'epoch': '0.2601', 'num_input_tokens_seen': 21143463, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3079', 'grad_norm': '0.9567', 'learning_rate': '4.987e-05', 'epoch': '0.2601', 'num_input_tokens_seen': 21145510, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6607', 'grad_norm': '1.408', 'learning_rate': '4.987e-05', 'epoch': '0.2601', 'num_input_tokens_seen': 21147557, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7056', 'grad_norm': '1.243', 'learning_rate': '4.987e-05', 'epoch': '0.2601', 'num_input_tokens_seen': 21149604, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7492', 'grad_norm': '1.464', 'learning_rate': '4.987e-05', 'epoch': '0.2602', 'num_input_tokens_seen': 21151651, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8165', 'grad_norm': '1.2', 'learning_rate': '4.987e-05', 'epoch': '0.2602', 'num_input_tokens_seen': 21153698, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8396', 'grad_norm': '1.419', 'learning_rate': '4.987e-05', 'epoch': '0.2602', 'num_input_tokens_seen': 21155745, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6666', 'grad_norm': '1.188', 'learning_rate': '4.987e-05', 'epoch': '0.2602', 'num_input_tokens_seen': 21157792, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3719', 'grad_norm': '0.8878', 'learning_rate': '4.987e-05', 'epoch': '0.2603', 'num_input_tokens_seen': 21159839, 'train_runtime': '1.07e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7389', 'grad_norm': '1.02', 'learning_rate': '4.987e-05', 'epoch': '0.2603', 'num_input_tokens_seen': 21161886, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6813', 'grad_norm': '1.438', 'learning_rate': '4.987e-05', 'epoch': '0.2603', 'num_input_tokens_seen': 21163933, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3576', 'grad_norm': '0.8175', 'learning_rate': '4.987e-05', 'epoch': '0.2603', 'num_input_tokens_seen': 21165980, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5542', 'grad_norm': '0.8402', 'learning_rate': '4.987e-05', 'epoch': '0.2604', 'num_input_tokens_seen': 21168027, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.17', 'grad_norm': '2.434', 'learning_rate': '4.987e-05', 'epoch': '0.2604', 'num_input_tokens_seen': 21170074, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2758', 'grad_norm': '0.945', 'learning_rate': '4.987e-05', 'epoch': '0.2604', 'num_input_tokens_seen': 21172121, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7158', 'grad_norm': '1.326', 'learning_rate': '4.987e-05', 'epoch': '0.2604', 'num_input_tokens_seen': 21174168, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.02', 'grad_norm': '1.18', 'learning_rate': '4.987e-05', 'epoch': '0.2605', 'num_input_tokens_seen': 21176215, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.568', 'grad_norm': '1.41', 'learning_rate': '4.987e-05', 'epoch': '0.2605', 'num_input_tokens_seen': 21178262, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9278', 'grad_norm': '1.835', 'learning_rate': '4.987e-05', 'epoch': '0.2605', 'num_input_tokens_seen': 21180309, 'train_runtime': '1.071e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4309', 'grad_norm': '1.047', 'learning_rate': '4.987e-05', 'epoch': '0.2605', 'num_input_tokens_seen': 21182356, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8123', 'grad_norm': '1.485', 'learning_rate': '4.987e-05', 'epoch': '0.2606', 'num_input_tokens_seen': 21184403, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4254', 'grad_norm': '1.333', 'learning_rate': '4.987e-05', 'epoch': '0.2606', 'num_input_tokens_seen': 21186450, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3891', 'grad_norm': '1.083', 'learning_rate': '4.987e-05', 'epoch': '0.2606', 'num_input_tokens_seen': 21188497, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3531', 'grad_norm': '0.7771', 'learning_rate': '4.987e-05', 'epoch': '0.2606', 'num_input_tokens_seen': 21190544, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9473', 'grad_norm': '1.962', 'learning_rate': '4.987e-05', 'epoch': '0.2607', 'num_input_tokens_seen': 21192591, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4906', 'grad_norm': '1.027', 'learning_rate': '4.987e-05', 'epoch': '0.2607', 'num_input_tokens_seen': 21194638, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.015', 'grad_norm': '2.012', 'learning_rate': '4.987e-05', 'epoch': '0.2607', 'num_input_tokens_seen': 21196685, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4004', 'grad_norm': '0.8369', 'learning_rate': '4.987e-05', 'epoch': '0.2607', 'num_input_tokens_seen': 21198732, 'train_runtime': '1.072e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5784', 'grad_norm': '1.176', 'learning_rate': '4.987e-05', 'epoch': '0.2608', 'num_input_tokens_seen': 21200779, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9983', 'grad_norm': '1.36', 'learning_rate': '4.987e-05', 'epoch': '0.2608', 'num_input_tokens_seen': 21202826, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8982', 'grad_norm': '1.533', 'learning_rate': '4.987e-05', 'epoch': '0.2608', 'num_input_tokens_seen': 21204873, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9231', 'grad_norm': '1.968', 'learning_rate': '4.987e-05', 'epoch': '0.2608', 'num_input_tokens_seen': 21206920, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.783', 'grad_norm': '1.149', 'learning_rate': '4.987e-05', 'epoch': '0.2609', 'num_input_tokens_seen': 21208967, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.419', 'grad_norm': '2.407', 'learning_rate': '4.987e-05', 'epoch': '0.2609', 'num_input_tokens_seen': 21211014, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.648', 'grad_norm': '2.323', 'learning_rate': '4.987e-05', 'epoch': '0.2609', 'num_input_tokens_seen': 21213061, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5357', 'grad_norm': '1.21', 'learning_rate': '4.987e-05', 'epoch': '0.2609', 'num_input_tokens_seen': 21215108, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5692', 'grad_norm': '1.468', 'learning_rate': '4.987e-05', 'epoch': '0.261', 'num_input_tokens_seen': 21217155, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6018', 'grad_norm': '1.144', 'learning_rate': '4.987e-05', 'epoch': '0.261', 'num_input_tokens_seen': 21219202, 'train_runtime': '1.073e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7218', 'grad_norm': '1.628', 'learning_rate': '4.987e-05', 'epoch': '0.261', 'num_input_tokens_seen': 21221249, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.061', 'grad_norm': '1.951', 'learning_rate': '4.987e-05', 'epoch': '0.261', 'num_input_tokens_seen': 21223296, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5817', 'grad_norm': '1.613', 'learning_rate': '4.987e-05', 'epoch': '0.2611', 'num_input_tokens_seen': 21225343, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.173', 'grad_norm': '1.652', 'learning_rate': '4.987e-05', 'epoch': '0.2611', 'num_input_tokens_seen': 21227390, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8498', 'grad_norm': '1.529', 'learning_rate': '4.987e-05', 'epoch': '0.2611', 'num_input_tokens_seen': 21229437, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4559', 'grad_norm': '1.573', 'learning_rate': '4.987e-05', 'epoch': '0.2611', 'num_input_tokens_seen': 21231484, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4682', 'grad_norm': '1.018', 'learning_rate': '4.987e-05', 'epoch': '0.2612', 'num_input_tokens_seen': 21233531, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8079', 'grad_norm': '1.127', 'learning_rate': '4.987e-05', 'epoch': '0.2612', 'num_input_tokens_seen': 21235578, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '1.432', 'learning_rate': '4.987e-05', 'epoch': '0.2612', 'num_input_tokens_seen': 21237625, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3246', 'grad_norm': '0.9094', 'learning_rate': '4.987e-05', 'epoch': '0.2612', 'num_input_tokens_seen': 21239672, 'train_runtime': '1.074e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8263', 'grad_norm': '1.483', 'learning_rate': '4.987e-05', 'epoch': '0.2613', 'num_input_tokens_seen': 21241719, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5211', 'grad_norm': '1.093', 'learning_rate': '4.987e-05', 'epoch': '0.2613', 'num_input_tokens_seen': 21243766, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4977', 'grad_norm': '1.252', 'learning_rate': '4.987e-05', 'epoch': '0.2613', 'num_input_tokens_seen': 21245813, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6287', 'grad_norm': '1.431', 'learning_rate': '4.987e-05', 'epoch': '0.2613', 'num_input_tokens_seen': 21247860, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5648', 'grad_norm': '1.381', 'learning_rate': '4.987e-05', 'epoch': '0.2614', 'num_input_tokens_seen': 21249907, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4681', 'grad_norm': '1.136', 'learning_rate': '4.987e-05', 'epoch': '0.2614', 'num_input_tokens_seen': 21251954, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7874', 'grad_norm': '1.215', 'learning_rate': '4.987e-05', 'epoch': '0.2614', 'num_input_tokens_seen': 21254001, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4274', 'grad_norm': '1.015', 'learning_rate': '4.987e-05', 'epoch': '0.2614', 'num_input_tokens_seen': 21256048, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8174', 'grad_norm': '1.377', 'learning_rate': '4.987e-05', 'epoch': '0.2615', 'num_input_tokens_seen': 21258095, 'train_runtime': '1.075e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.204', 'grad_norm': '2.161', 'learning_rate': '4.987e-05', 'epoch': '0.2615', 'num_input_tokens_seen': 21260142, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4636', 'grad_norm': '1.177', 'learning_rate': '4.987e-05', 'epoch': '0.2615', 'num_input_tokens_seen': 21262189, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2168', 'grad_norm': '0.8061', 'learning_rate': '4.987e-05', 'epoch': '0.2616', 'num_input_tokens_seen': 21264236, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5637', 'grad_norm': '1.495', 'learning_rate': '4.987e-05', 'epoch': '0.2616', 'num_input_tokens_seen': 21266283, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5147', 'grad_norm': '1.209', 'learning_rate': '4.987e-05', 'epoch': '0.2616', 'num_input_tokens_seen': 21268330, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5343', 'grad_norm': '1.249', 'learning_rate': '4.987e-05', 'epoch': '0.2616', 'num_input_tokens_seen': 21270377, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5326', 'grad_norm': '1.292', 'learning_rate': '4.987e-05', 'epoch': '0.2617', 'num_input_tokens_seen': 21272424, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.664', 'grad_norm': '2.798', 'learning_rate': '4.987e-05', 'epoch': '0.2617', 'num_input_tokens_seen': 21274471, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8096', 'grad_norm': '1.138', 'learning_rate': '4.987e-05', 'epoch': '0.2617', 'num_input_tokens_seen': 21276518, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3601', 'grad_norm': '0.9908', 'learning_rate': '4.987e-05', 'epoch': '0.2617', 'num_input_tokens_seen': 21278565, 'train_runtime': '1.076e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5481', 'grad_norm': '1.059', 'learning_rate': '4.987e-05', 'epoch': '0.2618', 'num_input_tokens_seen': 21280612, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7051', 'grad_norm': '1.272', 'learning_rate': '4.987e-05', 'epoch': '0.2618', 'num_input_tokens_seen': 21282659, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5289', 'grad_norm': '1.259', 'learning_rate': '4.987e-05', 'epoch': '0.2618', 'num_input_tokens_seen': 21284706, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8758', 'grad_norm': '1.373', 'learning_rate': '4.987e-05', 'epoch': '0.2618', 'num_input_tokens_seen': 21286753, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4669', 'grad_norm': '1.078', 'learning_rate': '4.987e-05', 'epoch': '0.2619', 'num_input_tokens_seen': 21288800, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4396', 'grad_norm': '1.209', 'learning_rate': '4.987e-05', 'epoch': '0.2619', 'num_input_tokens_seen': 21290847, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.115', 'grad_norm': '1.799', 'learning_rate': '4.987e-05', 'epoch': '0.2619', 'num_input_tokens_seen': 21292894, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8596', 'grad_norm': '1.37', 'learning_rate': '4.987e-05', 'epoch': '0.2619', 'num_input_tokens_seen': 21294941, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9742', 'grad_norm': '1.013', 'learning_rate': '4.987e-05', 'epoch': '0.262', 'num_input_tokens_seen': 21296988, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2283', 'grad_norm': '0.8444', 'learning_rate': '4.987e-05', 'epoch': '0.262', 'num_input_tokens_seen': 21299035, 'train_runtime': '1.077e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2729', 'grad_norm': '0.7931', 'learning_rate': '4.987e-05', 'epoch': '0.262', 'num_input_tokens_seen': 21301082, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5873', 'grad_norm': '1.33', 'learning_rate': '4.987e-05', 'epoch': '0.262', 'num_input_tokens_seen': 21303129, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9392', 'grad_norm': '1.407', 'learning_rate': '4.987e-05', 'epoch': '0.2621', 'num_input_tokens_seen': 21305176, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6704', 'grad_norm': '1.239', 'learning_rate': '4.987e-05', 'epoch': '0.2621', 'num_input_tokens_seen': 21307223, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.017', 'grad_norm': '1.748', 'learning_rate': '4.987e-05', 'epoch': '0.2621', 'num_input_tokens_seen': 21309270, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4717', 'grad_norm': '1.139', 'learning_rate': '4.987e-05', 'epoch': '0.2621', 'num_input_tokens_seen': 21311317, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3135', 'grad_norm': '0.8147', 'learning_rate': '4.987e-05', 'epoch': '0.2622', 'num_input_tokens_seen': 21313364, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3051', 'grad_norm': '0.8163', 'learning_rate': '4.987e-05', 'epoch': '0.2622', 'num_input_tokens_seen': 21315411, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4366', 'grad_norm': '0.9838', 'learning_rate': '4.986e-05', 'epoch': '0.2622', 'num_input_tokens_seen': 21317458, 'train_runtime': '1.078e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4494', 'grad_norm': '1.32', 'learning_rate': '4.986e-05', 'epoch': '0.2622', 'num_input_tokens_seen': 21319505, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3537', 'grad_norm': '0.835', 'learning_rate': '4.986e-05', 'epoch': '0.2623', 'num_input_tokens_seen': 21321552, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5018', 'grad_norm': '1.19', 'learning_rate': '4.986e-05', 'epoch': '0.2623', 'num_input_tokens_seen': 21323599, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.096', 'grad_norm': '1.684', 'learning_rate': '4.986e-05', 'epoch': '0.2623', 'num_input_tokens_seen': 21325646, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8206', 'grad_norm': '1.579', 'learning_rate': '4.986e-05', 'epoch': '0.2623', 'num_input_tokens_seen': 21327693, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2431', 'grad_norm': '0.9861', 'learning_rate': '4.986e-05', 'epoch': '0.2624', 'num_input_tokens_seen': 21329740, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.331', 'grad_norm': '2.28', 'learning_rate': '4.986e-05', 'epoch': '0.2624', 'num_input_tokens_seen': 21331787, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5768', 'grad_norm': '1.287', 'learning_rate': '4.986e-05', 'epoch': '0.2624', 'num_input_tokens_seen': 21333834, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3074', 'grad_norm': '0.7817', 'learning_rate': '4.986e-05', 'epoch': '0.2624', 'num_input_tokens_seen': 21335881, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6399', 'grad_norm': '1.018', 'learning_rate': '4.986e-05', 'epoch': '0.2625', 'num_input_tokens_seen': 21337928, 'train_runtime': '1.079e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4674', 'grad_norm': '1.171', 'learning_rate': '4.986e-05', 'epoch': '0.2625', 'num_input_tokens_seen': 21339975, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.505', 'grad_norm': '2.148', 'learning_rate': '4.986e-05', 'epoch': '0.2625', 'num_input_tokens_seen': 21342022, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5559', 'grad_norm': '1.526', 'learning_rate': '4.986e-05', 'epoch': '0.2625', 'num_input_tokens_seen': 21344069, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.353', 'grad_norm': '3.046', 'learning_rate': '4.986e-05', 'epoch': '0.2626', 'num_input_tokens_seen': 21346116, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.718', 'grad_norm': '2.422', 'learning_rate': '4.986e-05', 'epoch': '0.2626', 'num_input_tokens_seen': 21348163, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.891', 'grad_norm': '2.638', 'learning_rate': '4.986e-05', 'epoch': '0.2626', 'num_input_tokens_seen': 21350210, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5057', 'grad_norm': '1.227', 'learning_rate': '4.986e-05', 'epoch': '0.2626', 'num_input_tokens_seen': 21352257, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.364', 'grad_norm': '1.909', 'learning_rate': '4.986e-05', 'epoch': '0.2627', 'num_input_tokens_seen': 21354304, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.025', 'grad_norm': '2.86', 'learning_rate': '4.986e-05', 'epoch': '0.2627', 'num_input_tokens_seen': 21356351, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.319', 'grad_norm': '1.897', 'learning_rate': '4.986e-05', 'epoch': '0.2627', 'num_input_tokens_seen': 21358398, 'train_runtime': '1.08e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.228', 'grad_norm': '1.66', 'learning_rate': '4.986e-05', 'epoch': '0.2627', 'num_input_tokens_seen': 21360445, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6677', 'grad_norm': '1.265', 'learning_rate': '4.986e-05', 'epoch': '0.2628', 'num_input_tokens_seen': 21362492, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.037', 'grad_norm': '1.801', 'learning_rate': '4.986e-05', 'epoch': '0.2628', 'num_input_tokens_seen': 21364539, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.935', 'grad_norm': '1.55', 'learning_rate': '4.986e-05', 'epoch': '0.2628', 'num_input_tokens_seen': 21366586, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5951', 'grad_norm': '1.179', 'learning_rate': '4.986e-05', 'epoch': '0.2628', 'num_input_tokens_seen': 21368633, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3842', 'grad_norm': '1.343', 'learning_rate': '4.986e-05', 'epoch': '0.2629', 'num_input_tokens_seen': 21370680, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3796', 'grad_norm': '0.9269', 'learning_rate': '4.986e-05', 'epoch': '0.2629', 'num_input_tokens_seen': 21372727, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8808', 'grad_norm': '1.506', 'learning_rate': '4.986e-05', 'epoch': '0.2629', 'num_input_tokens_seen': 21374774, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6506', 'grad_norm': '1.239', 'learning_rate': '4.986e-05', 'epoch': '0.2629', 'num_input_tokens_seen': 21376821, 'train_runtime': '1.081e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.162', 'grad_norm': '2.318', 'learning_rate': '4.986e-05', 'epoch': '0.263', 'num_input_tokens_seen': 21378868, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4062', 'grad_norm': '1.02', 'learning_rate': '4.986e-05', 'epoch': '0.263', 'num_input_tokens_seen': 21380915, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7039', 'grad_norm': '1.184', 'learning_rate': '4.986e-05', 'epoch': '0.263', 'num_input_tokens_seen': 21382962, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8126', 'grad_norm': '1.347', 'learning_rate': '4.986e-05', 'epoch': '0.263', 'num_input_tokens_seen': 21385009, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3617', 'grad_norm': '0.932', 'learning_rate': '4.986e-05', 'epoch': '0.2631', 'num_input_tokens_seen': 21387056, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3621', 'grad_norm': '1.099', 'learning_rate': '4.986e-05', 'epoch': '0.2631', 'num_input_tokens_seen': 21389103, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7327', 'grad_norm': '1.539', 'learning_rate': '4.986e-05', 'epoch': '0.2631', 'num_input_tokens_seen': 21391150, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2431', 'grad_norm': '0.7369', 'learning_rate': '4.986e-05', 'epoch': '0.2631', 'num_input_tokens_seen': 21393197, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3514', 'grad_norm': '1.04', 'learning_rate': '4.986e-05', 'epoch': '0.2632', 'num_input_tokens_seen': 21395244, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2342', 'grad_norm': '0.9085', 'learning_rate': '4.986e-05', 'epoch': '0.2632', 'num_input_tokens_seen': 21397291, 'train_runtime': '1.082e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2454', 'grad_norm': '0.9199', 'learning_rate': '4.986e-05', 'epoch': '0.2632', 'num_input_tokens_seen': 21399338, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3263', 'grad_norm': '0.9235', 'learning_rate': '4.986e-05', 'epoch': '0.2632', 'num_input_tokens_seen': 21401385, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3395', 'grad_norm': '0.8309', 'learning_rate': '4.986e-05', 'epoch': '0.2633', 'num_input_tokens_seen': 21403432, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4283', 'grad_norm': '1.078', 'learning_rate': '4.986e-05', 'epoch': '0.2633', 'num_input_tokens_seen': 21405479, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.28', 'grad_norm': '1.784', 'learning_rate': '4.986e-05', 'epoch': '0.2633', 'num_input_tokens_seen': 21407526, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.286', 'grad_norm': '3.06', 'learning_rate': '4.986e-05', 'epoch': '0.2633', 'num_input_tokens_seen': 21409573, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.272', 'grad_norm': '0.8332', 'learning_rate': '4.986e-05', 'epoch': '0.2634', 'num_input_tokens_seen': 21411620, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6536', 'grad_norm': '1.156', 'learning_rate': '4.986e-05', 'epoch': '0.2634', 'num_input_tokens_seen': 21413667, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3465', 'grad_norm': '0.9528', 'learning_rate': '4.986e-05', 'epoch': '0.2634', 'num_input_tokens_seen': 21415714, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7406', 'grad_norm': '1.44', 'learning_rate': '4.986e-05', 'epoch': '0.2634', 'num_input_tokens_seen': 21417761, 'train_runtime': '1.083e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.055', 'grad_norm': '2.129', 'learning_rate': '4.986e-05', 'epoch': '0.2635', 'num_input_tokens_seen': 21419808, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5792', 'grad_norm': '1.267', 'learning_rate': '4.986e-05', 'epoch': '0.2635', 'num_input_tokens_seen': 21421855, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5582', 'grad_norm': '1.093', 'learning_rate': '4.986e-05', 'epoch': '0.2635', 'num_input_tokens_seen': 21423902, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6297', 'grad_norm': '1.208', 'learning_rate': '4.986e-05', 'epoch': '0.2635', 'num_input_tokens_seen': 21425949, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3718', 'grad_norm': '1.133', 'learning_rate': '4.986e-05', 'epoch': '0.2636', 'num_input_tokens_seen': 21427996, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5534', 'grad_norm': '1.173', 'learning_rate': '4.986e-05', 'epoch': '0.2636', 'num_input_tokens_seen': 21430043, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5178', 'grad_norm': '1.123', 'learning_rate': '4.986e-05', 'epoch': '0.2636', 'num_input_tokens_seen': 21432090, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4198', 'grad_norm': '1.149', 'learning_rate': '4.986e-05', 'epoch': '0.2636', 'num_input_tokens_seen': 21434137, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6312', 'grad_norm': '1.494', 'learning_rate': '4.986e-05', 'epoch': '0.2637', 'num_input_tokens_seen': 21436184, 'train_runtime': '1.084e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.811', 'grad_norm': '1.828', 'learning_rate': '4.986e-05', 'epoch': '0.2637', 'num_input_tokens_seen': 21438231, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.333', 'grad_norm': '2.373', 'learning_rate': '4.986e-05', 'epoch': '0.2637', 'num_input_tokens_seen': 21440278, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6389', 'grad_norm': '1.311', 'learning_rate': '4.986e-05', 'epoch': '0.2637', 'num_input_tokens_seen': 21442325, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8611', 'grad_norm': '1.465', 'learning_rate': '4.986e-05', 'epoch': '0.2638', 'num_input_tokens_seen': 21444372, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6212', 'grad_norm': '1.363', 'learning_rate': '4.986e-05', 'epoch': '0.2638', 'num_input_tokens_seen': 21446419, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.457', 'grad_norm': '0.9153', 'learning_rate': '4.986e-05', 'epoch': '0.2638', 'num_input_tokens_seen': 21448466, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.048', 'grad_norm': '1.868', 'learning_rate': '4.986e-05', 'epoch': '0.2638', 'num_input_tokens_seen': 21450513, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4137', 'grad_norm': '0.9059', 'learning_rate': '4.986e-05', 'epoch': '0.2639', 'num_input_tokens_seen': 21452560, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3629', 'grad_norm': '1.015', 'learning_rate': '4.986e-05', 'epoch': '0.2639', 'num_input_tokens_seen': 21454607, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4888', 'grad_norm': '1.218', 'learning_rate': '4.986e-05', 'epoch': '0.2639', 'num_input_tokens_seen': 21456654, 'train_runtime': '1.085e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.244', 'grad_norm': '2.835', 'learning_rate': '4.986e-05', 'epoch': '0.2639', 'num_input_tokens_seen': 21458701, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5232', 'grad_norm': '1.254', 'learning_rate': '4.986e-05', 'epoch': '0.264', 'num_input_tokens_seen': 21460748, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4498', 'grad_norm': '1.268', 'learning_rate': '4.986e-05', 'epoch': '0.264', 'num_input_tokens_seen': 21462795, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5505', 'grad_norm': '1.214', 'learning_rate': '4.986e-05', 'epoch': '0.264', 'num_input_tokens_seen': 21464842, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3149', 'grad_norm': '0.8586', 'learning_rate': '4.986e-05', 'epoch': '0.264', 'num_input_tokens_seen': 21466889, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8234', 'grad_norm': '1.518', 'learning_rate': '4.986e-05', 'epoch': '0.2641', 'num_input_tokens_seen': 21468936, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.237', 'grad_norm': '1.93', 'learning_rate': '4.986e-05', 'epoch': '0.2641', 'num_input_tokens_seen': 21470983, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6083', 'grad_norm': '1.361', 'learning_rate': '4.986e-05', 'epoch': '0.2641', 'num_input_tokens_seen': 21473030, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2456', 'grad_norm': '0.9972', 'learning_rate': '4.986e-05', 'epoch': '0.2641', 'num_input_tokens_seen': 21475077, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3285', 'grad_norm': '0.9446', 'learning_rate': '4.986e-05', 'epoch': '0.2642', 'num_input_tokens_seen': 21477124, 'train_runtime': '1.086e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2785', 'grad_norm': '1.031', 'learning_rate': '4.986e-05', 'epoch': '0.2642', 'num_input_tokens_seen': 21479171, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1939', 'grad_norm': '0.7734', 'learning_rate': '4.986e-05', 'epoch': '0.2642', 'num_input_tokens_seen': 21481218, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.827', 'grad_norm': '1.225', 'learning_rate': '4.986e-05', 'epoch': '0.2642', 'num_input_tokens_seen': 21483265, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.478', 'grad_norm': '0.9986', 'learning_rate': '4.986e-05', 'epoch': '0.2643', 'num_input_tokens_seen': 21485312, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.151', 'grad_norm': '2.222', 'learning_rate': '4.986e-05', 'epoch': '0.2643', 'num_input_tokens_seen': 21487359, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6028', 'grad_norm': '1.249', 'learning_rate': '4.986e-05', 'epoch': '0.2643', 'num_input_tokens_seen': 21489406, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2433', 'grad_norm': '0.7661', 'learning_rate': '4.986e-05', 'epoch': '0.2643', 'num_input_tokens_seen': 21491453, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3249', 'grad_norm': '0.7911', 'learning_rate': '4.986e-05', 'epoch': '0.2644', 'num_input_tokens_seen': 21493500, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7123', 'grad_norm': '0.848', 'learning_rate': '4.986e-05', 'epoch': '0.2644', 'num_input_tokens_seen': 21495547, 'train_runtime': '1.087e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5992', 'grad_norm': '1.212', 'learning_rate': '4.986e-05', 'epoch': '0.2644', 'num_input_tokens_seen': 21497594, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1932', 'grad_norm': '0.8617', 'learning_rate': '4.986e-05', 'epoch': '0.2644', 'num_input_tokens_seen': 21499641, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5867', 'grad_norm': '1.426', 'learning_rate': '4.986e-05', 'epoch': '0.2645', 'num_input_tokens_seen': 21501688, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7046', 'grad_norm': '1.325', 'learning_rate': '4.986e-05', 'epoch': '0.2645', 'num_input_tokens_seen': 21503735, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.236', 'grad_norm': '2.487', 'learning_rate': '4.986e-05', 'epoch': '0.2645', 'num_input_tokens_seen': 21505782, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7299', 'grad_norm': '1.426', 'learning_rate': '4.986e-05', 'epoch': '0.2645', 'num_input_tokens_seen': 21507829, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4719', 'grad_norm': '0.9972', 'learning_rate': '4.986e-05', 'epoch': '0.2646', 'num_input_tokens_seen': 21509876, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3243', 'grad_norm': '0.8711', 'learning_rate': '4.986e-05', 'epoch': '0.2646', 'num_input_tokens_seen': 21511923, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4345', 'grad_norm': '1.15', 'learning_rate': '4.986e-05', 'epoch': '0.2646', 'num_input_tokens_seen': 21513970, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5554', 'grad_norm': '1.215', 'learning_rate': '4.986e-05', 'epoch': '0.2646', 'num_input_tokens_seen': 21516017, 'train_runtime': '1.088e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8021', 'grad_norm': '1.589', 'learning_rate': '4.986e-05', 'epoch': '0.2647', 'num_input_tokens_seen': 21518064, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3739', 'grad_norm': '0.8941', 'learning_rate': '4.986e-05', 'epoch': '0.2647', 'num_input_tokens_seen': 21520111, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5226', 'grad_norm': '1.29', 'learning_rate': '4.986e-05', 'epoch': '0.2647', 'num_input_tokens_seen': 21522158, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.975', 'grad_norm': '2.238', 'learning_rate': '4.986e-05', 'epoch': '0.2647', 'num_input_tokens_seen': 21524205, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3523', 'grad_norm': '0.7792', 'learning_rate': '4.986e-05', 'epoch': '0.2648', 'num_input_tokens_seen': 21526252, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8347', 'grad_norm': '1.397', 'learning_rate': '4.986e-05', 'epoch': '0.2648', 'num_input_tokens_seen': 21528299, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.56', 'grad_norm': '2.737', 'learning_rate': '4.986e-05', 'epoch': '0.2648', 'num_input_tokens_seen': 21530346, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8016', 'grad_norm': '0.946', 'learning_rate': '4.986e-05', 'epoch': '0.2648', 'num_input_tokens_seen': 21532393, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2525', 'grad_norm': '0.8501', 'learning_rate': '4.986e-05', 'epoch': '0.2649', 'num_input_tokens_seen': 21534440, 'train_runtime': '1.089e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4178', 'grad_norm': '1.157', 'learning_rate': '4.986e-05', 'epoch': '0.2649', 'num_input_tokens_seen': 21536487, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9917', 'grad_norm': '1.364', 'learning_rate': '4.986e-05', 'epoch': '0.2649', 'num_input_tokens_seen': 21538534, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3012', 'grad_norm': '0.7973', 'learning_rate': '4.986e-05', 'epoch': '0.2649', 'num_input_tokens_seen': 21540581, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6652', 'grad_norm': '0.9134', 'learning_rate': '4.986e-05', 'epoch': '0.265', 'num_input_tokens_seen': 21542628, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4099', 'grad_norm': '0.7401', 'learning_rate': '4.986e-05', 'epoch': '0.265', 'num_input_tokens_seen': 21544675, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.527', 'grad_norm': '1.058', 'learning_rate': '4.986e-05', 'epoch': '0.265', 'num_input_tokens_seen': 21546722, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8901', 'grad_norm': '1.444', 'learning_rate': '4.986e-05', 'epoch': '0.2651', 'num_input_tokens_seen': 21548769, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.065', 'grad_norm': '2.138', 'learning_rate': '4.986e-05', 'epoch': '0.2651', 'num_input_tokens_seen': 21550816, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7911', 'grad_norm': '1.186', 'learning_rate': '4.986e-05', 'epoch': '0.2651', 'num_input_tokens_seen': 21552863, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9453', 'grad_norm': '1.059', 'learning_rate': '4.986e-05', 'epoch': '0.2651', 'num_input_tokens_seen': 21554910, 'train_runtime': '1.09e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5598', 'grad_norm': '1.468', 'learning_rate': '4.986e-05', 'epoch': '0.2652', 'num_input_tokens_seen': 21556957, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6395', 'grad_norm': '1.007', 'learning_rate': '4.986e-05', 'epoch': '0.2652', 'num_input_tokens_seen': 21559004, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5302', 'grad_norm': '1.075', 'learning_rate': '4.986e-05', 'epoch': '0.2652', 'num_input_tokens_seen': 21561051, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8714', 'grad_norm': '1.877', 'learning_rate': '4.986e-05', 'epoch': '0.2652', 'num_input_tokens_seen': 21563098, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5403', 'grad_norm': '1.566', 'learning_rate': '4.986e-05', 'epoch': '0.2653', 'num_input_tokens_seen': 21565145, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4319', 'grad_norm': '0.9528', 'learning_rate': '4.986e-05', 'epoch': '0.2653', 'num_input_tokens_seen': 21567192, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.221', 'grad_norm': '2.087', 'learning_rate': '4.986e-05', 'epoch': '0.2653', 'num_input_tokens_seen': 21569239, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7047', 'grad_norm': '1.06', 'learning_rate': '4.986e-05', 'epoch': '0.2653', 'num_input_tokens_seen': 21571286, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.189', 'grad_norm': '2.249', 'learning_rate': '4.986e-05', 'epoch': '0.2654', 'num_input_tokens_seen': 21573333, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8252', 'grad_norm': '2.301', 'learning_rate': '4.986e-05', 'epoch': '0.2654', 'num_input_tokens_seen': 21575380, 'train_runtime': '1.091e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.633', 'grad_norm': '2.327', 'learning_rate': '4.986e-05', 'epoch': '0.2654', 'num_input_tokens_seen': 21577427, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3665', 'grad_norm': '0.8953', 'learning_rate': '4.986e-05', 'epoch': '0.2654', 'num_input_tokens_seen': 21579474, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.987', 'grad_norm': '2.837', 'learning_rate': '4.986e-05', 'epoch': '0.2655', 'num_input_tokens_seen': 21581521, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.07', 'grad_norm': '2.371', 'learning_rate': '4.986e-05', 'epoch': '0.2655', 'num_input_tokens_seen': 21583568, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9629', 'grad_norm': '1.695', 'learning_rate': '4.986e-05', 'epoch': '0.2655', 'num_input_tokens_seen': 21585615, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2825', 'grad_norm': '0.8731', 'learning_rate': '4.986e-05', 'epoch': '0.2655', 'num_input_tokens_seen': 21587662, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3163', 'grad_norm': '0.7393', 'learning_rate': '4.986e-05', 'epoch': '0.2656', 'num_input_tokens_seen': 21589709, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2583', 'grad_norm': '0.9668', 'learning_rate': '4.986e-05', 'epoch': '0.2656', 'num_input_tokens_seen': 21591756, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8029', 'grad_norm': '1.15', 'learning_rate': '4.986e-05', 'epoch': '0.2656', 'num_input_tokens_seen': 21593803, 'train_runtime': '1.092e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5939', 'grad_norm': '1.306', 'learning_rate': '4.986e-05', 'epoch': '0.2656', 'num_input_tokens_seen': 21595850, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4772', 'grad_norm': '0.9782', 'learning_rate': '4.986e-05', 'epoch': '0.2657', 'num_input_tokens_seen': 21597897, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9023', 'grad_norm': '1.429', 'learning_rate': '4.986e-05', 'epoch': '0.2657', 'num_input_tokens_seen': 21599944, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7029', 'grad_norm': '1.285', 'learning_rate': '4.986e-05', 'epoch': '0.2657', 'num_input_tokens_seen': 21601991, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2903', 'grad_norm': '0.9566', 'learning_rate': '4.986e-05', 'epoch': '0.2657', 'num_input_tokens_seen': 21604038, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8938', 'grad_norm': '1.638', 'learning_rate': '4.986e-05', 'epoch': '0.2658', 'num_input_tokens_seen': 21606085, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7092', 'grad_norm': '1.463', 'learning_rate': '4.986e-05', 'epoch': '0.2658', 'num_input_tokens_seen': 21608132, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5614', 'grad_norm': '1.528', 'learning_rate': '4.986e-05', 'epoch': '0.2658', 'num_input_tokens_seen': 21610179, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3153', 'grad_norm': '0.8933', 'learning_rate': '4.986e-05', 'epoch': '0.2658', 'num_input_tokens_seen': 21612226, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8529', 'grad_norm': '1.504', 'learning_rate': '4.986e-05', 'epoch': '0.2659', 'num_input_tokens_seen': 21614273, 'train_runtime': '1.093e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8507', 'grad_norm': '1.254', 'learning_rate': '4.986e-05', 'epoch': '0.2659', 'num_input_tokens_seen': 21616320, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.165', 'grad_norm': '1.343', 'learning_rate': '4.986e-05', 'epoch': '0.2659', 'num_input_tokens_seen': 21618367, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6605', 'grad_norm': '1.176', 'learning_rate': '4.986e-05', 'epoch': '0.2659', 'num_input_tokens_seen': 21620414, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2765', 'grad_norm': '0.9386', 'learning_rate': '4.986e-05', 'epoch': '0.266', 'num_input_tokens_seen': 21622461, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4351', 'grad_norm': '0.9443', 'learning_rate': '4.986e-05', 'epoch': '0.266', 'num_input_tokens_seen': 21624508, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.177', 'grad_norm': '2.001', 'learning_rate': '4.986e-05', 'epoch': '0.266', 'num_input_tokens_seen': 21626555, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.366', 'grad_norm': '1.97', 'learning_rate': '4.986e-05', 'epoch': '0.266', 'num_input_tokens_seen': 21628602, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.826', 'grad_norm': '1.177', 'learning_rate': '4.986e-05', 'epoch': '0.2661', 'num_input_tokens_seen': 21630649, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8247', 'grad_norm': '1.389', 'learning_rate': '4.986e-05', 'epoch': '0.2661', 'num_input_tokens_seen': 21632696, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5083', 'grad_norm': '1.137', 'learning_rate': '4.986e-05', 'epoch': '0.2661', 'num_input_tokens_seen': 21634743, 'train_runtime': '1.094e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6556', 'grad_norm': '0.956', 'learning_rate': '4.986e-05', 'epoch': '0.2661', 'num_input_tokens_seen': 21636790, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2355', 'grad_norm': '0.9036', 'learning_rate': '4.986e-05', 'epoch': '0.2662', 'num_input_tokens_seen': 21638837, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.004', 'grad_norm': '2.385', 'learning_rate': '4.986e-05', 'epoch': '0.2662', 'num_input_tokens_seen': 21640884, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7442', 'grad_norm': '1.293', 'learning_rate': '4.986e-05', 'epoch': '0.2662', 'num_input_tokens_seen': 21642931, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3918', 'grad_norm': '0.9962', 'learning_rate': '4.986e-05', 'epoch': '0.2662', 'num_input_tokens_seen': 21644978, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7416', 'grad_norm': '1.001', 'learning_rate': '4.986e-05', 'epoch': '0.2663', 'num_input_tokens_seen': 21647025, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.864', 'grad_norm': '1.338', 'learning_rate': '4.986e-05', 'epoch': '0.2663', 'num_input_tokens_seen': 21649072, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.154', 'grad_norm': '1.752', 'learning_rate': '4.986e-05', 'epoch': '0.2663', 'num_input_tokens_seen': 21651119, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4068', 'grad_norm': '0.9905', 'learning_rate': '4.986e-05', 'epoch': '0.2663', 'num_input_tokens_seen': 21653166, 'train_runtime': '1.095e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6885', 'grad_norm': '1.257', 'learning_rate': '4.986e-05', 'epoch': '0.2664', 'num_input_tokens_seen': 21655213, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4739', 'grad_norm': '1.249', 'learning_rate': '4.986e-05', 'epoch': '0.2664', 'num_input_tokens_seen': 21657260, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.174', 'grad_norm': '1.921', 'learning_rate': '4.986e-05', 'epoch': '0.2664', 'num_input_tokens_seen': 21659307, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2569', 'grad_norm': '0.8882', 'learning_rate': '4.986e-05', 'epoch': '0.2664', 'num_input_tokens_seen': 21661354, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.331', 'grad_norm': '0.8354', 'learning_rate': '4.986e-05', 'epoch': '0.2665', 'num_input_tokens_seen': 21663401, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6722', 'grad_norm': '1.199', 'learning_rate': '4.986e-05', 'epoch': '0.2665', 'num_input_tokens_seen': 21665448, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.069', 'grad_norm': '1.298', 'learning_rate': '4.986e-05', 'epoch': '0.2665', 'num_input_tokens_seen': 21667495, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3166', 'grad_norm': '0.836', 'learning_rate': '4.986e-05', 'epoch': '0.2665', 'num_input_tokens_seen': 21669542, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4493', 'grad_norm': '1.01', 'learning_rate': '4.986e-05', 'epoch': '0.2666', 'num_input_tokens_seen': 21671589, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.416', 'grad_norm': '2.399', 'learning_rate': '4.986e-05', 'epoch': '0.2666', 'num_input_tokens_seen': 21673636, 'train_runtime': '1.096e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4781', 'grad_norm': '1.087', 'learning_rate': '4.986e-05', 'epoch': '0.2666', 'num_input_tokens_seen': 21675683, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3502', 'grad_norm': '0.9343', 'learning_rate': '4.986e-05', 'epoch': '0.2666', 'num_input_tokens_seen': 21677730, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2151', 'grad_norm': '0.9903', 'learning_rate': '4.986e-05', 'epoch': '0.2667', 'num_input_tokens_seen': 21679777, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.264', 'grad_norm': '1.066', 'learning_rate': '4.986e-05', 'epoch': '0.2667', 'num_input_tokens_seen': 21681824, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6717', 'grad_norm': '0.9566', 'learning_rate': '4.986e-05', 'epoch': '0.2667', 'num_input_tokens_seen': 21683871, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5539', 'grad_norm': '4.189', 'learning_rate': '4.986e-05', 'epoch': '0.2667', 'num_input_tokens_seen': 21685918, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3801', 'grad_norm': '1.067', 'learning_rate': '4.986e-05', 'epoch': '0.2668', 'num_input_tokens_seen': 21687965, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8166', 'grad_norm': '1.428', 'learning_rate': '4.986e-05', 'epoch': '0.2668', 'num_input_tokens_seen': 21690012, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7447', 'grad_norm': '1.283', 'learning_rate': '4.986e-05', 'epoch': '0.2668', 'num_input_tokens_seen': 21692059, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.137', 'grad_norm': '1.946', 'learning_rate': '4.986e-05', 'epoch': '0.2668', 'num_input_tokens_seen': 21694106, 'train_runtime': '1.097e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8266', 'grad_norm': '1.415', 'learning_rate': '4.986e-05', 'epoch': '0.2669', 'num_input_tokens_seen': 21696153, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5988', 'grad_norm': '1.182', 'learning_rate': '4.986e-05', 'epoch': '0.2669', 'num_input_tokens_seen': 21698200, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4408', 'grad_norm': '1.091', 'learning_rate': '4.986e-05', 'epoch': '0.2669', 'num_input_tokens_seen': 21700247, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.643', 'grad_norm': '1.209', 'learning_rate': '4.986e-05', 'epoch': '0.2669', 'num_input_tokens_seen': 21702294, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3307', 'grad_norm': '0.868', 'learning_rate': '4.986e-05', 'epoch': '0.267', 'num_input_tokens_seen': 21704341, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5958', 'grad_norm': '1.234', 'learning_rate': '4.986e-05', 'epoch': '0.267', 'num_input_tokens_seen': 21706388, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.644', 'grad_norm': '1.05', 'learning_rate': '4.986e-05', 'epoch': '0.267', 'num_input_tokens_seen': 21708435, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.28', 'grad_norm': '2.362', 'learning_rate': '4.986e-05', 'epoch': '0.267', 'num_input_tokens_seen': 21710482, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3394', 'grad_norm': '1.051', 'learning_rate': '4.986e-05', 'epoch': '0.2671', 'num_input_tokens_seen': 21712529, 'train_runtime': '1.098e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6485', 'grad_norm': '1.338', 'learning_rate': '4.986e-05', 'epoch': '0.2671', 'num_input_tokens_seen': 21714576, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4215', 'grad_norm': '0.9174', 'learning_rate': '4.986e-05', 'epoch': '0.2671', 'num_input_tokens_seen': 21716623, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7259', 'grad_norm': '1.848', 'learning_rate': '4.986e-05', 'epoch': '0.2671', 'num_input_tokens_seen': 21718670, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2984', 'grad_norm': '0.85', 'learning_rate': '4.986e-05', 'epoch': '0.2672', 'num_input_tokens_seen': 21720717, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4939', 'grad_norm': '0.9615', 'learning_rate': '4.986e-05', 'epoch': '0.2672', 'num_input_tokens_seen': 21722764, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6773', 'grad_norm': '1.271', 'learning_rate': '4.986e-05', 'epoch': '0.2672', 'num_input_tokens_seen': 21724811, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2146', 'grad_norm': '0.779', 'learning_rate': '4.986e-05', 'epoch': '0.2672', 'num_input_tokens_seen': 21726858, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.673', 'grad_norm': '1.029', 'learning_rate': '4.986e-05', 'epoch': '0.2673', 'num_input_tokens_seen': 21728905, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7811', 'grad_norm': '1.273', 'learning_rate': '4.986e-05', 'epoch': '0.2673', 'num_input_tokens_seen': 21730952, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8117', 'grad_norm': '1.334', 'learning_rate': '4.986e-05', 'epoch': '0.2673', 'num_input_tokens_seen': 21732999, 'train_runtime': '1.099e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.379', 'grad_norm': '2.202', 'learning_rate': '4.986e-05', 'epoch': '0.2673', 'num_input_tokens_seen': 21735046, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6389', 'grad_norm': '1.218', 'learning_rate': '4.986e-05', 'epoch': '0.2674', 'num_input_tokens_seen': 21737093, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8859', 'grad_norm': '1.39', 'learning_rate': '4.986e-05', 'epoch': '0.2674', 'num_input_tokens_seen': 21739140, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5712', 'grad_norm': '1.049', 'learning_rate': '4.986e-05', 'epoch': '0.2674', 'num_input_tokens_seen': 21741187, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4181', 'grad_norm': '0.8432', 'learning_rate': '4.986e-05', 'epoch': '0.2674', 'num_input_tokens_seen': 21743234, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.433', 'grad_norm': '1.027', 'learning_rate': '4.986e-05', 'epoch': '0.2675', 'num_input_tokens_seen': 21745281, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.671', 'grad_norm': '1.337', 'learning_rate': '4.986e-05', 'epoch': '0.2675', 'num_input_tokens_seen': 21747328, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8581', 'grad_norm': '1.494', 'learning_rate': '4.986e-05', 'epoch': '0.2675', 'num_input_tokens_seen': 21749375, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7394', 'grad_norm': '1.269', 'learning_rate': '4.986e-05', 'epoch': '0.2675', 'num_input_tokens_seen': 21751422, 'train_runtime': '1.1e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6185', 'grad_norm': '1.43', 'learning_rate': '4.986e-05', 'epoch': '0.2676', 'num_input_tokens_seen': 21753469, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8305', 'grad_norm': '1.132', 'learning_rate': '4.986e-05', 'epoch': '0.2676', 'num_input_tokens_seen': 21755516, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9043', 'grad_norm': '1.892', 'learning_rate': '4.986e-05', 'epoch': '0.2676', 'num_input_tokens_seen': 21757563, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1914', 'grad_norm': '0.7733', 'learning_rate': '4.986e-05', 'epoch': '0.2676', 'num_input_tokens_seen': 21759610, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3662', 'grad_norm': '0.7015', 'learning_rate': '4.986e-05', 'epoch': '0.2677', 'num_input_tokens_seen': 21761657, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2995', 'grad_norm': '0.8442', 'learning_rate': '4.986e-05', 'epoch': '0.2677', 'num_input_tokens_seen': 21763704, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8502', 'grad_norm': '1.213', 'learning_rate': '4.986e-05', 'epoch': '0.2677', 'num_input_tokens_seen': 21765751, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.03', 'grad_norm': '1.984', 'learning_rate': '4.986e-05', 'epoch': '0.2677', 'num_input_tokens_seen': 21767798, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7746', 'grad_norm': '1.248', 'learning_rate': '4.986e-05', 'epoch': '0.2678', 'num_input_tokens_seen': 21769845, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2702', 'grad_norm': '0.9057', 'learning_rate': '4.986e-05', 'epoch': '0.2678', 'num_input_tokens_seen': 21771892, 'train_runtime': '1.101e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.435', 'grad_norm': '0.8684', 'learning_rate': '4.986e-05', 'epoch': '0.2678', 'num_input_tokens_seen': 21773939, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9054', 'grad_norm': '1.472', 'learning_rate': '4.986e-05', 'epoch': '0.2678', 'num_input_tokens_seen': 21775986, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3088', 'grad_norm': '0.8091', 'learning_rate': '4.986e-05', 'epoch': '0.2679', 'num_input_tokens_seen': 21778033, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7291', 'grad_norm': '1.352', 'learning_rate': '4.986e-05', 'epoch': '0.2679', 'num_input_tokens_seen': 21780080, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8742', 'grad_norm': '1.167', 'learning_rate': '4.986e-05', 'epoch': '0.2679', 'num_input_tokens_seen': 21782127, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9295', 'grad_norm': '1.439', 'learning_rate': '4.986e-05', 'epoch': '0.2679', 'num_input_tokens_seen': 21784174, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6623', 'grad_norm': '1.391', 'learning_rate': '4.986e-05', 'epoch': '0.268', 'num_input_tokens_seen': 21786221, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.614', 'grad_norm': '3.189', 'learning_rate': '4.986e-05', 'epoch': '0.268', 'num_input_tokens_seen': 21788268, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.452', 'grad_norm': '2.33', 'learning_rate': '4.986e-05', 'epoch': '0.268', 'num_input_tokens_seen': 21790315, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.13', 'grad_norm': '2.125', 'learning_rate': '4.986e-05', 'epoch': '0.268', 'num_input_tokens_seen': 21792362, 'train_runtime': '1.102e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.634', 'grad_norm': '2.214', 'learning_rate': '4.986e-05', 'epoch': '0.2681', 'num_input_tokens_seen': 21794409, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5558', 'grad_norm': '1.219', 'learning_rate': '4.986e-05', 'epoch': '0.2681', 'num_input_tokens_seen': 21796456, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.301', 'grad_norm': '2.03', 'learning_rate': '4.985e-05', 'epoch': '0.2681', 'num_input_tokens_seen': 21798503, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5573', 'grad_norm': '1.136', 'learning_rate': '4.985e-05', 'epoch': '0.2681', 'num_input_tokens_seen': 21800550, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.491', 'grad_norm': '2.261', 'learning_rate': '4.985e-05', 'epoch': '0.2682', 'num_input_tokens_seen': 21802597, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3615', 'grad_norm': '0.9013', 'learning_rate': '4.985e-05', 'epoch': '0.2682', 'num_input_tokens_seen': 21804644, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3743', 'grad_norm': '0.8908', 'learning_rate': '4.985e-05', 'epoch': '0.2682', 'num_input_tokens_seen': 21806691, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4956', 'grad_norm': '1.43', 'learning_rate': '4.985e-05', 'epoch': '0.2682', 'num_input_tokens_seen': 21808738, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5984', 'grad_norm': '1.074', 'learning_rate': '4.985e-05', 'epoch': '0.2683', 'num_input_tokens_seen': 21810785, 'train_runtime': '1.103e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4665', 'grad_norm': '0.9497', 'learning_rate': '4.985e-05', 'epoch': '0.2683', 'num_input_tokens_seen': 21812832, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8519', 'grad_norm': '1.688', 'learning_rate': '4.985e-05', 'epoch': '0.2683', 'num_input_tokens_seen': 21814879, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3368', 'grad_norm': '0.8792', 'learning_rate': '4.985e-05', 'epoch': '0.2683', 'num_input_tokens_seen': 21816926, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.607', 'grad_norm': '2.931', 'learning_rate': '4.985e-05', 'epoch': '0.2684', 'num_input_tokens_seen': 21818973, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3278', 'grad_norm': '0.8223', 'learning_rate': '4.985e-05', 'epoch': '0.2684', 'num_input_tokens_seen': 21821020, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4065', 'grad_norm': '0.8951', 'learning_rate': '4.985e-05', 'epoch': '0.2684', 'num_input_tokens_seen': 21823067, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2758', 'grad_norm': '0.8574', 'learning_rate': '4.985e-05', 'epoch': '0.2684', 'num_input_tokens_seen': 21825114, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7874', 'grad_norm': '1.369', 'learning_rate': '4.985e-05', 'epoch': '0.2685', 'num_input_tokens_seen': 21827161, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3204', 'grad_norm': '0.853', 'learning_rate': '4.985e-05', 'epoch': '0.2685', 'num_input_tokens_seen': 21829208, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5028', 'grad_norm': '1.239', 'learning_rate': '4.985e-05', 'epoch': '0.2685', 'num_input_tokens_seen': 21831255, 'train_runtime': '1.104e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6098', 'grad_norm': '1.316', 'learning_rate': '4.985e-05', 'epoch': '0.2685', 'num_input_tokens_seen': 21833302, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6904', 'grad_norm': '1.285', 'learning_rate': '4.985e-05', 'epoch': '0.2686', 'num_input_tokens_seen': 21835349, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8866', 'grad_norm': '1.22', 'learning_rate': '4.985e-05', 'epoch': '0.2686', 'num_input_tokens_seen': 21837396, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.556', 'grad_norm': '1.314', 'learning_rate': '4.985e-05', 'epoch': '0.2686', 'num_input_tokens_seen': 21839443, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8708', 'grad_norm': '1.462', 'learning_rate': '4.985e-05', 'epoch': '0.2687', 'num_input_tokens_seen': 21841490, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3336', 'grad_norm': '0.9086', 'learning_rate': '4.985e-05', 'epoch': '0.2687', 'num_input_tokens_seen': 21843537, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.829', 'grad_norm': '2.325', 'learning_rate': '4.985e-05', 'epoch': '0.2687', 'num_input_tokens_seen': 21845584, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.332', 'grad_norm': '2.113', 'learning_rate': '4.985e-05', 'epoch': '0.2687', 'num_input_tokens_seen': 21847631, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7822', 'grad_norm': '1.347', 'learning_rate': '4.985e-05', 'epoch': '0.2688', 'num_input_tokens_seen': 21849678, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4279', 'grad_norm': '0.9361', 'learning_rate': '4.985e-05', 'epoch': '0.2688', 'num_input_tokens_seen': 21851725, 'train_runtime': '1.105e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6382', 'grad_norm': '1.422', 'learning_rate': '4.985e-05', 'epoch': '0.2688', 'num_input_tokens_seen': 21853772, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.198', 'grad_norm': '0.9937', 'learning_rate': '4.985e-05', 'epoch': '0.2688', 'num_input_tokens_seen': 21855819, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4274', 'grad_norm': '0.9258', 'learning_rate': '4.985e-05', 'epoch': '0.2689', 'num_input_tokens_seen': 21857866, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.084', 'grad_norm': '2.102', 'learning_rate': '4.985e-05', 'epoch': '0.2689', 'num_input_tokens_seen': 21859913, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1918', 'grad_norm': '0.7488', 'learning_rate': '4.985e-05', 'epoch': '0.2689', 'num_input_tokens_seen': 21861960, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.066', 'grad_norm': '1.764', 'learning_rate': '4.985e-05', 'epoch': '0.2689', 'num_input_tokens_seen': 21864007, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4636', 'grad_norm': '0.95', 'learning_rate': '4.985e-05', 'epoch': '0.269', 'num_input_tokens_seen': 21866054, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.108', 'grad_norm': '1.563', 'learning_rate': '4.985e-05', 'epoch': '0.269', 'num_input_tokens_seen': 21868101, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.569', 'grad_norm': '2.227', 'learning_rate': '4.985e-05', 'epoch': '0.269', 'num_input_tokens_seen': 21870148, 'train_runtime': '1.106e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5517', 'grad_norm': '1.465', 'learning_rate': '4.985e-05', 'epoch': '0.269', 'num_input_tokens_seen': 21872195, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.725', 'grad_norm': '1.351', 'learning_rate': '4.985e-05', 'epoch': '0.2691', 'num_input_tokens_seen': 21874242, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4544', 'grad_norm': '1.081', 'learning_rate': '4.985e-05', 'epoch': '0.2691', 'num_input_tokens_seen': 21876289, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7351', 'grad_norm': '1.135', 'learning_rate': '4.985e-05', 'epoch': '0.2691', 'num_input_tokens_seen': 21878336, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4068', 'grad_norm': '0.9391', 'learning_rate': '4.985e-05', 'epoch': '0.2691', 'num_input_tokens_seen': 21880383, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4003', 'grad_norm': '0.8923', 'learning_rate': '4.985e-05', 'epoch': '0.2692', 'num_input_tokens_seen': 21882430, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7966', 'grad_norm': '1.533', 'learning_rate': '4.985e-05', 'epoch': '0.2692', 'num_input_tokens_seen': 21884477, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9933', 'grad_norm': '2.194', 'learning_rate': '4.985e-05', 'epoch': '0.2692', 'num_input_tokens_seen': 21886524, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.141', 'grad_norm': '2.367', 'learning_rate': '4.985e-05', 'epoch': '0.2692', 'num_input_tokens_seen': 21888571, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4632', 'grad_norm': '1.25', 'learning_rate': '4.985e-05', 'epoch': '0.2693', 'num_input_tokens_seen': 21890618, 'train_runtime': '1.107e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8178', 'grad_norm': '1.293', 'learning_rate': '4.985e-05', 'epoch': '0.2693', 'num_input_tokens_seen': 21892665, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3848', 'grad_norm': '1.17', 'learning_rate': '4.985e-05', 'epoch': '0.2693', 'num_input_tokens_seen': 21894712, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7525', 'grad_norm': '1.093', 'learning_rate': '4.985e-05', 'epoch': '0.2693', 'num_input_tokens_seen': 21896759, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6638', 'grad_norm': '1.219', 'learning_rate': '4.985e-05', 'epoch': '0.2694', 'num_input_tokens_seen': 21898806, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4945', 'grad_norm': '1.26', 'learning_rate': '4.985e-05', 'epoch': '0.2694', 'num_input_tokens_seen': 21900853, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.032', 'grad_norm': '2.11', 'learning_rate': '4.985e-05', 'epoch': '0.2694', 'num_input_tokens_seen': 21902900, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5361', 'grad_norm': '1.374', 'learning_rate': '4.985e-05', 'epoch': '0.2694', 'num_input_tokens_seen': 21904947, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5826', 'grad_norm': '1.205', 'learning_rate': '4.985e-05', 'epoch': '0.2695', 'num_input_tokens_seen': 21906994, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2579', 'grad_norm': '1.03', 'learning_rate': '4.985e-05', 'epoch': '0.2695', 'num_input_tokens_seen': 21909041, 'train_runtime': '1.108e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8198', 'grad_norm': '1.745', 'learning_rate': '4.985e-05', 'epoch': '0.2695', 'num_input_tokens_seen': 21911088, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7694', 'grad_norm': '1.203', 'learning_rate': '4.985e-05', 'epoch': '0.2695', 'num_input_tokens_seen': 21913135, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.828', 'grad_norm': '1.691', 'learning_rate': '4.985e-05', 'epoch': '0.2696', 'num_input_tokens_seen': 21915182, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4096', 'grad_norm': '1.084', 'learning_rate': '4.985e-05', 'epoch': '0.2696', 'num_input_tokens_seen': 21917229, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.921', 'grad_norm': '2.243', 'learning_rate': '4.985e-05', 'epoch': '0.2696', 'num_input_tokens_seen': 21919276, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3751', 'grad_norm': '0.8551', 'learning_rate': '4.985e-05', 'epoch': '0.2696', 'num_input_tokens_seen': 21921323, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.77', 'grad_norm': '1.294', 'learning_rate': '4.985e-05', 'epoch': '0.2697', 'num_input_tokens_seen': 21923370, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.322', 'grad_norm': '0.8639', 'learning_rate': '4.985e-05', 'epoch': '0.2697', 'num_input_tokens_seen': 21925417, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8091', 'grad_norm': '1.406', 'learning_rate': '4.985e-05', 'epoch': '0.2697', 'num_input_tokens_seen': 21927464, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.052', 'grad_norm': '1.675', 'learning_rate': '4.985e-05', 'epoch': '0.2697', 'num_input_tokens_seen': 21929511, 'train_runtime': '1.109e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.984', 'grad_norm': '2.767', 'learning_rate': '4.985e-05', 'epoch': '0.2698', 'num_input_tokens_seen': 21931558, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.026', 'grad_norm': '1.621', 'learning_rate': '4.985e-05', 'epoch': '0.2698', 'num_input_tokens_seen': 21933605, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8536', 'grad_norm': '1.581', 'learning_rate': '4.985e-05', 'epoch': '0.2698', 'num_input_tokens_seen': 21935652, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.72', 'grad_norm': '2.695', 'learning_rate': '4.985e-05', 'epoch': '0.2698', 'num_input_tokens_seen': 21937699, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8305', 'grad_norm': '1.674', 'learning_rate': '4.985e-05', 'epoch': '0.2699', 'num_input_tokens_seen': 21939746, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4079', 'grad_norm': '0.9842', 'learning_rate': '4.985e-05', 'epoch': '0.2699', 'num_input_tokens_seen': 21941793, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.194', 'grad_norm': '1.851', 'learning_rate': '4.985e-05', 'epoch': '0.2699', 'num_input_tokens_seen': 21943840, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5138', 'grad_norm': '1.374', 'learning_rate': '4.985e-05', 'epoch': '0.2699', 'num_input_tokens_seen': 21945887, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3176', 'grad_norm': '0.8386', 'learning_rate': '4.985e-05', 'epoch': '0.27', 'num_input_tokens_seen': 21947934, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6513', 'grad_norm': '1.199', 'learning_rate': '4.985e-05', 'epoch': '0.27', 'num_input_tokens_seen': 21949981, 'train_runtime': '1.11e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6292', 'grad_norm': '1.342', 'learning_rate': '4.985e-05', 'epoch': '0.27', 'num_input_tokens_seen': 21952028, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5906', 'grad_norm': '1.141', 'learning_rate': '4.985e-05', 'epoch': '0.27', 'num_input_tokens_seen': 21954075, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3925', 'grad_norm': '0.8396', 'learning_rate': '4.985e-05', 'epoch': '0.2701', 'num_input_tokens_seen': 21956122, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3692', 'grad_norm': '0.7415', 'learning_rate': '4.985e-05', 'epoch': '0.2701', 'num_input_tokens_seen': 21958169, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6713', 'grad_norm': '1.327', 'learning_rate': '4.985e-05', 'epoch': '0.2701', 'num_input_tokens_seen': 21960216, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.061', 'grad_norm': '1.961', 'learning_rate': '4.985e-05', 'epoch': '0.2701', 'num_input_tokens_seen': 21962263, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.105', 'grad_norm': '1.889', 'learning_rate': '4.985e-05', 'epoch': '0.2702', 'num_input_tokens_seen': 21964310, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5165', 'grad_norm': '0.882', 'learning_rate': '4.985e-05', 'epoch': '0.2702', 'num_input_tokens_seen': 21966357, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6966', 'grad_norm': '1.512', 'learning_rate': '4.985e-05', 'epoch': '0.2702', 'num_input_tokens_seen': 21968404, 'train_runtime': '1.111e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4592', 'grad_norm': '0.9306', 'learning_rate': '4.985e-05', 'epoch': '0.2702', 'num_input_tokens_seen': 21970451, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.095', 'grad_norm': '2.244', 'learning_rate': '4.985e-05', 'epoch': '0.2703', 'num_input_tokens_seen': 21972498, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1169', 'grad_norm': '0.7193', 'learning_rate': '4.985e-05', 'epoch': '0.2703', 'num_input_tokens_seen': 21974545, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4558', 'grad_norm': '1.293', 'learning_rate': '4.985e-05', 'epoch': '0.2703', 'num_input_tokens_seen': 21976592, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.508', 'grad_norm': '2.377', 'learning_rate': '4.985e-05', 'epoch': '0.2703', 'num_input_tokens_seen': 21978639, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5218', 'grad_norm': '1.217', 'learning_rate': '4.985e-05', 'epoch': '0.2704', 'num_input_tokens_seen': 21980686, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5592', 'grad_norm': '1.408', 'learning_rate': '4.985e-05', 'epoch': '0.2704', 'num_input_tokens_seen': 21982733, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.332', 'grad_norm': '2.126', 'learning_rate': '4.985e-05', 'epoch': '0.2704', 'num_input_tokens_seen': 21984780, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7719', 'grad_norm': '1.727', 'learning_rate': '4.985e-05', 'epoch': '0.2704', 'num_input_tokens_seen': 21986827, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4095', 'grad_norm': '0.8875', 'learning_rate': '4.985e-05', 'epoch': '0.2705', 'num_input_tokens_seen': 21988874, 'train_runtime': '1.112e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9375', 'grad_norm': '1.701', 'learning_rate': '4.985e-05', 'epoch': '0.2705', 'num_input_tokens_seen': 21990921, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7553', 'grad_norm': '1.327', 'learning_rate': '4.985e-05', 'epoch': '0.2705', 'num_input_tokens_seen': 21992968, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9175', 'grad_norm': '1.554', 'learning_rate': '4.985e-05', 'epoch': '0.2705', 'num_input_tokens_seen': 21995015, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6244', 'grad_norm': '1.526', 'learning_rate': '4.985e-05', 'epoch': '0.2706', 'num_input_tokens_seen': 21997062, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4319', 'grad_norm': '1.198', 'learning_rate': '4.985e-05', 'epoch': '0.2706', 'num_input_tokens_seen': 21999109, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8382', 'grad_norm': '1.331', 'learning_rate': '4.985e-05', 'epoch': '0.2706', 'num_input_tokens_seen': 22001156, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9636', 'grad_norm': '1.528', 'learning_rate': '4.985e-05', 'epoch': '0.2706', 'num_input_tokens_seen': 22003203, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8368', 'grad_norm': '1.562', 'learning_rate': '4.985e-05', 'epoch': '0.2707', 'num_input_tokens_seen': 22005250, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4208', 'grad_norm': '1.045', 'learning_rate': '4.985e-05', 'epoch': '0.2707', 'num_input_tokens_seen': 22007297, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6512', 'grad_norm': '1.506', 'learning_rate': '4.985e-05', 'epoch': '0.2707', 'num_input_tokens_seen': 22009344, 'train_runtime': '1.113e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4245', 'grad_norm': '0.949', 'learning_rate': '4.985e-05', 'epoch': '0.2707', 'num_input_tokens_seen': 22011391, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.276', 'grad_norm': '2.016', 'learning_rate': '4.985e-05', 'epoch': '0.2708', 'num_input_tokens_seen': 22013438, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.372', 'learning_rate': '4.985e-05', 'epoch': '0.2708', 'num_input_tokens_seen': 22015485, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5619', 'grad_norm': '1.304', 'learning_rate': '4.985e-05', 'epoch': '0.2708', 'num_input_tokens_seen': 22017532, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.105', 'grad_norm': '2.095', 'learning_rate': '4.985e-05', 'epoch': '0.2708', 'num_input_tokens_seen': 22019579, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5828', 'grad_norm': '1.278', 'learning_rate': '4.985e-05', 'epoch': '0.2709', 'num_input_tokens_seen': 22021626, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.414', 'grad_norm': '2.16', 'learning_rate': '4.985e-05', 'epoch': '0.2709', 'num_input_tokens_seen': 22023673, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3774', 'grad_norm': '0.926', 'learning_rate': '4.985e-05', 'epoch': '0.2709', 'num_input_tokens_seen': 22025720, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6503', 'grad_norm': '1.218', 'learning_rate': '4.985e-05', 'epoch': '0.2709', 'num_input_tokens_seen': 22027767, 'train_runtime': '1.114e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9256', 'grad_norm': '1.539', 'learning_rate': '4.985e-05', 'epoch': '0.271', 'num_input_tokens_seen': 22029814, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5287', 'grad_norm': '1.299', 'learning_rate': '4.985e-05', 'epoch': '0.271', 'num_input_tokens_seen': 22031861, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6935', 'grad_norm': '1.276', 'learning_rate': '4.985e-05', 'epoch': '0.271', 'num_input_tokens_seen': 22033908, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2052', 'grad_norm': '0.7515', 'learning_rate': '4.985e-05', 'epoch': '0.271', 'num_input_tokens_seen': 22035955, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5978', 'grad_norm': '1.356', 'learning_rate': '4.985e-05', 'epoch': '0.2711', 'num_input_tokens_seen': 22038002, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8548', 'grad_norm': '1.383', 'learning_rate': '4.985e-05', 'epoch': '0.2711', 'num_input_tokens_seen': 22040049, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5561', 'grad_norm': '1.293', 'learning_rate': '4.985e-05', 'epoch': '0.2711', 'num_input_tokens_seen': 22042096, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3579', 'grad_norm': '0.9972', 'learning_rate': '4.985e-05', 'epoch': '0.2711', 'num_input_tokens_seen': 22044143, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8891', 'grad_norm': '1.475', 'learning_rate': '4.985e-05', 'epoch': '0.2712', 'num_input_tokens_seen': 22046190, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5736', 'grad_norm': '1.178', 'learning_rate': '4.985e-05', 'epoch': '0.2712', 'num_input_tokens_seen': 22048237, 'train_runtime': '1.115e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.008', 'grad_norm': '1.486', 'learning_rate': '4.985e-05', 'epoch': '0.2712', 'num_input_tokens_seen': 22050284, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2988', 'grad_norm': '0.7967', 'learning_rate': '4.985e-05', 'epoch': '0.2712', 'num_input_tokens_seen': 22052331, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3375', 'grad_norm': '0.8782', 'learning_rate': '4.985e-05', 'epoch': '0.2713', 'num_input_tokens_seen': 22054378, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.191', 'grad_norm': '1.972', 'learning_rate': '4.985e-05', 'epoch': '0.2713', 'num_input_tokens_seen': 22056425, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5182', 'grad_norm': '1.259', 'learning_rate': '4.985e-05', 'epoch': '0.2713', 'num_input_tokens_seen': 22058472, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.078', 'grad_norm': '2.116', 'learning_rate': '4.985e-05', 'epoch': '0.2713', 'num_input_tokens_seen': 22060519, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7337', 'grad_norm': '1.104', 'learning_rate': '4.985e-05', 'epoch': '0.2714', 'num_input_tokens_seen': 22062566, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5191', 'grad_norm': '1.27', 'learning_rate': '4.985e-05', 'epoch': '0.2714', 'num_input_tokens_seen': 22064613, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4321', 'grad_norm': '1.1', 'learning_rate': '4.985e-05', 'epoch': '0.2714', 'num_input_tokens_seen': 22066660, 'train_runtime': '1.116e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8035', 'grad_norm': '1.975', 'learning_rate': '4.985e-05', 'epoch': '0.2714', 'num_input_tokens_seen': 22068707, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6189', 'grad_norm': '1.347', 'learning_rate': '4.985e-05', 'epoch': '0.2715', 'num_input_tokens_seen': 22070754, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9975', 'grad_norm': '1.864', 'learning_rate': '4.985e-05', 'epoch': '0.2715', 'num_input_tokens_seen': 22072801, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.671', 'grad_norm': '1.419', 'learning_rate': '4.985e-05', 'epoch': '0.2715', 'num_input_tokens_seen': 22074848, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.184', 'grad_norm': '1.031', 'learning_rate': '4.985e-05', 'epoch': '0.2715', 'num_input_tokens_seen': 22076895, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6151', 'grad_norm': '1.02', 'learning_rate': '4.985e-05', 'epoch': '0.2716', 'num_input_tokens_seen': 22078942, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5049', 'grad_norm': '1.131', 'learning_rate': '4.985e-05', 'epoch': '0.2716', 'num_input_tokens_seen': 22080989, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.875', 'grad_norm': '1.159', 'learning_rate': '4.985e-05', 'epoch': '0.2716', 'num_input_tokens_seen': 22083036, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.027', 'grad_norm': '1.996', 'learning_rate': '4.985e-05', 'epoch': '0.2716', 'num_input_tokens_seen': 22085083, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2337', 'grad_norm': '0.9675', 'learning_rate': '4.985e-05', 'epoch': '0.2717', 'num_input_tokens_seen': 22087130, 'train_runtime': '1.117e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9444', 'grad_norm': '1.346', 'learning_rate': '4.985e-05', 'epoch': '0.2717', 'num_input_tokens_seen': 22089177, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5907', 'grad_norm': '1.062', 'learning_rate': '4.985e-05', 'epoch': '0.2717', 'num_input_tokens_seen': 22091224, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4434', 'grad_norm': '1.17', 'learning_rate': '4.985e-05', 'epoch': '0.2717', 'num_input_tokens_seen': 22093271, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4525', 'grad_norm': '1.235', 'learning_rate': '4.985e-05', 'epoch': '0.2718', 'num_input_tokens_seen': 22095318, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.84', 'learning_rate': '4.985e-05', 'epoch': '0.2718', 'num_input_tokens_seen': 22097365, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5536', 'grad_norm': '1.267', 'learning_rate': '4.985e-05', 'epoch': '0.2718', 'num_input_tokens_seen': 22099412, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5156', 'grad_norm': '1.263', 'learning_rate': '4.985e-05', 'epoch': '0.2718', 'num_input_tokens_seen': 22101459, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4884', 'grad_norm': '1.146', 'learning_rate': '4.985e-05', 'epoch': '0.2719', 'num_input_tokens_seen': 22103506, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5236', 'grad_norm': '1.348', 'learning_rate': '4.985e-05', 'epoch': '0.2719', 'num_input_tokens_seen': 22105553, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.205', 'grad_norm': '2.049', 'learning_rate': '4.985e-05', 'epoch': '0.2719', 'num_input_tokens_seen': 22107600, 'train_runtime': '1.118e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.131', 'grad_norm': '1.799', 'learning_rate': '4.985e-05', 'epoch': '0.2719', 'num_input_tokens_seen': 22109647, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6735', 'grad_norm': '1.068', 'learning_rate': '4.985e-05', 'epoch': '0.272', 'num_input_tokens_seen': 22111694, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5613', 'grad_norm': '1.315', 'learning_rate': '4.985e-05', 'epoch': '0.272', 'num_input_tokens_seen': 22113741, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.099', 'grad_norm': '1.923', 'learning_rate': '4.985e-05', 'epoch': '0.272', 'num_input_tokens_seen': 22115788, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6605', 'grad_norm': '1.15', 'learning_rate': '4.985e-05', 'epoch': '0.272', 'num_input_tokens_seen': 22117835, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5612', 'grad_norm': '1.068', 'learning_rate': '4.985e-05', 'epoch': '0.2721', 'num_input_tokens_seen': 22119882, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.693', 'grad_norm': '2.851', 'learning_rate': '4.985e-05', 'epoch': '0.2721', 'num_input_tokens_seen': 22121929, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.62', 'grad_norm': '2.584', 'learning_rate': '4.985e-05', 'epoch': '0.2721', 'num_input_tokens_seen': 22123976, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5811', 'grad_norm': '1.041', 'learning_rate': '4.985e-05', 'epoch': '0.2722', 'num_input_tokens_seen': 22126023, 'train_runtime': '1.119e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6908', 'grad_norm': '1.211', 'learning_rate': '4.985e-05', 'epoch': '0.2722', 'num_input_tokens_seen': 22128070, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9824', 'grad_norm': '1.457', 'learning_rate': '4.985e-05', 'epoch': '0.2722', 'num_input_tokens_seen': 22130117, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.221', 'grad_norm': '0.8696', 'learning_rate': '4.985e-05', 'epoch': '0.2722', 'num_input_tokens_seen': 22132164, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4666', 'grad_norm': '1.303', 'learning_rate': '4.985e-05', 'epoch': '0.2723', 'num_input_tokens_seen': 22134211, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5221', 'grad_norm': '1.295', 'learning_rate': '4.985e-05', 'epoch': '0.2723', 'num_input_tokens_seen': 22136258, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3112', 'grad_norm': '0.9', 'learning_rate': '4.985e-05', 'epoch': '0.2723', 'num_input_tokens_seen': 22138305, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7039', 'grad_norm': '1.197', 'learning_rate': '4.985e-05', 'epoch': '0.2723', 'num_input_tokens_seen': 22140352, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3848', 'grad_norm': '0.9492', 'learning_rate': '4.985e-05', 'epoch': '0.2724', 'num_input_tokens_seen': 22142399, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4627', 'grad_norm': '1.001', 'learning_rate': '4.985e-05', 'epoch': '0.2724', 'num_input_tokens_seen': 22144446, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1856', 'grad_norm': '0.84', 'learning_rate': '4.985e-05', 'epoch': '0.2724', 'num_input_tokens_seen': 22146493, 'train_runtime': '1.12e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5784', 'grad_norm': '1.291', 'learning_rate': '4.985e-05', 'epoch': '0.2724', 'num_input_tokens_seen': 22148540, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8619', 'grad_norm': '0.994', 'learning_rate': '4.985e-05', 'epoch': '0.2725', 'num_input_tokens_seen': 22150587, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5435', 'grad_norm': '1.349', 'learning_rate': '4.985e-05', 'epoch': '0.2725', 'num_input_tokens_seen': 22152634, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5832', 'grad_norm': '1.273', 'learning_rate': '4.985e-05', 'epoch': '0.2725', 'num_input_tokens_seen': 22154681, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2507', 'grad_norm': '0.9289', 'learning_rate': '4.985e-05', 'epoch': '0.2725', 'num_input_tokens_seen': 22156728, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.418', 'grad_norm': '0.9458', 'learning_rate': '4.985e-05', 'epoch': '0.2726', 'num_input_tokens_seen': 22158775, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7491', 'grad_norm': '1.295', 'learning_rate': '4.985e-05', 'epoch': '0.2726', 'num_input_tokens_seen': 22160822, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2408', 'grad_norm': '0.8239', 'learning_rate': '4.985e-05', 'epoch': '0.2726', 'num_input_tokens_seen': 22162869, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7291', 'grad_norm': '1.435', 'learning_rate': '4.985e-05', 'epoch': '0.2726', 'num_input_tokens_seen': 22164916, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4417', 'grad_norm': '0.9904', 'learning_rate': '4.985e-05', 'epoch': '0.2727', 'num_input_tokens_seen': 22166963, 'train_runtime': '1.121e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.681', 'grad_norm': '1.214', 'learning_rate': '4.985e-05', 'epoch': '0.2727', 'num_input_tokens_seen': 22169010, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.119', 'grad_norm': '2.061', 'learning_rate': '4.985e-05', 'epoch': '0.2727', 'num_input_tokens_seen': 22171057, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7871', 'grad_norm': '1.724', 'learning_rate': '4.985e-05', 'epoch': '0.2727', 'num_input_tokens_seen': 22173104, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.521', 'grad_norm': '2.624', 'learning_rate': '4.985e-05', 'epoch': '0.2728', 'num_input_tokens_seen': 22175151, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.254', 'grad_norm': '1.975', 'learning_rate': '4.985e-05', 'epoch': '0.2728', 'num_input_tokens_seen': 22177198, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4356', 'grad_norm': '0.928', 'learning_rate': '4.985e-05', 'epoch': '0.2728', 'num_input_tokens_seen': 22179245, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2798', 'grad_norm': '0.9691', 'learning_rate': '4.985e-05', 'epoch': '0.2728', 'num_input_tokens_seen': 22181292, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3932', 'grad_norm': '0.9695', 'learning_rate': '4.985e-05', 'epoch': '0.2729', 'num_input_tokens_seen': 22183339, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1647', 'grad_norm': '0.6988', 'learning_rate': '4.985e-05', 'epoch': '0.2729', 'num_input_tokens_seen': 22185386, 'train_runtime': '1.122e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7177', 'grad_norm': '1.459', 'learning_rate': '4.985e-05', 'epoch': '0.2729', 'num_input_tokens_seen': 22187433, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.706', 'grad_norm': '1.394', 'learning_rate': '4.985e-05', 'epoch': '0.2729', 'num_input_tokens_seen': 22189480, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9448', 'grad_norm': '1.859', 'learning_rate': '4.985e-05', 'epoch': '0.273', 'num_input_tokens_seen': 22191527, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7652', 'grad_norm': '0.9936', 'learning_rate': '4.985e-05', 'epoch': '0.273', 'num_input_tokens_seen': 22193574, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4014', 'grad_norm': '0.8864', 'learning_rate': '4.985e-05', 'epoch': '0.273', 'num_input_tokens_seen': 22195621, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3958', 'grad_norm': '1.045', 'learning_rate': '4.985e-05', 'epoch': '0.273', 'num_input_tokens_seen': 22197668, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8577', 'grad_norm': '2.113', 'learning_rate': '4.985e-05', 'epoch': '0.2731', 'num_input_tokens_seen': 22199715, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.556', 'grad_norm': '2.577', 'learning_rate': '4.985e-05', 'epoch': '0.2731', 'num_input_tokens_seen': 22201762, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2236', 'grad_norm': '0.8813', 'learning_rate': '4.985e-05', 'epoch': '0.2731', 'num_input_tokens_seen': 22203809, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8008', 'grad_norm': '1.091', 'learning_rate': '4.985e-05', 'epoch': '0.2731', 'num_input_tokens_seen': 22205856, 'train_runtime': '1.123e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.133', 'grad_norm': '1.673', 'learning_rate': '4.985e-05', 'epoch': '0.2732', 'num_input_tokens_seen': 22207903, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1686', 'grad_norm': '0.8404', 'learning_rate': '4.985e-05', 'epoch': '0.2732', 'num_input_tokens_seen': 22209950, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7347', 'grad_norm': '1.127', 'learning_rate': '4.985e-05', 'epoch': '0.2732', 'num_input_tokens_seen': 22211997, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.036', 'grad_norm': '1.72', 'learning_rate': '4.985e-05', 'epoch': '0.2732', 'num_input_tokens_seen': 22214044, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7825', 'grad_norm': '1.413', 'learning_rate': '4.985e-05', 'epoch': '0.2733', 'num_input_tokens_seen': 22216091, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7483', 'grad_norm': '1.969', 'learning_rate': '4.985e-05', 'epoch': '0.2733', 'num_input_tokens_seen': 22218138, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6661', 'grad_norm': '1.584', 'learning_rate': '4.985e-05', 'epoch': '0.2733', 'num_input_tokens_seen': 22220185, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6759', 'grad_norm': '1.462', 'learning_rate': '4.985e-05', 'epoch': '0.2733', 'num_input_tokens_seen': 22222232, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6291', 'grad_norm': '1.323', 'learning_rate': '4.985e-05', 'epoch': '0.2734', 'num_input_tokens_seen': 22224279, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3642', 'grad_norm': '1.043', 'learning_rate': '4.985e-05', 'epoch': '0.2734', 'num_input_tokens_seen': 22226326, 'train_runtime': '1.124e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.33', 'grad_norm': '0.9281', 'learning_rate': '4.985e-05', 'epoch': '0.2734', 'num_input_tokens_seen': 22228373, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4687', 'grad_norm': '1.01', 'learning_rate': '4.985e-05', 'epoch': '0.2734', 'num_input_tokens_seen': 22230420, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3655', 'grad_norm': '0.8685', 'learning_rate': '4.985e-05', 'epoch': '0.2735', 'num_input_tokens_seen': 22232467, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4255', 'grad_norm': '1.226', 'learning_rate': '4.985e-05', 'epoch': '0.2735', 'num_input_tokens_seen': 22234514, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9362', 'grad_norm': '1.586', 'learning_rate': '4.985e-05', 'epoch': '0.2735', 'num_input_tokens_seen': 22236561, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5423', 'grad_norm': '1.16', 'learning_rate': '4.985e-05', 'epoch': '0.2735', 'num_input_tokens_seen': 22238608, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.772', 'grad_norm': '2.96', 'learning_rate': '4.985e-05', 'epoch': '0.2736', 'num_input_tokens_seen': 22240655, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4621', 'grad_norm': '0.9669', 'learning_rate': '4.985e-05', 'epoch': '0.2736', 'num_input_tokens_seen': 22242702, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.038', 'grad_norm': '1.757', 'learning_rate': '4.985e-05', 'epoch': '0.2736', 'num_input_tokens_seen': 22244749, 'train_runtime': '1.125e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.296', 'grad_norm': '0.9203', 'learning_rate': '4.985e-05', 'epoch': '0.2736', 'num_input_tokens_seen': 22246796, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3713', 'grad_norm': '0.8956', 'learning_rate': '4.985e-05', 'epoch': '0.2737', 'num_input_tokens_seen': 22248843, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9566', 'grad_norm': '1.452', 'learning_rate': '4.985e-05', 'epoch': '0.2737', 'num_input_tokens_seen': 22250890, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3463', 'grad_norm': '0.7203', 'learning_rate': '4.985e-05', 'epoch': '0.2737', 'num_input_tokens_seen': 22252937, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3448', 'grad_norm': '1.12', 'learning_rate': '4.985e-05', 'epoch': '0.2737', 'num_input_tokens_seen': 22254984, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.107', 'grad_norm': '1.885', 'learning_rate': '4.985e-05', 'epoch': '0.2738', 'num_input_tokens_seen': 22257031, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7605', 'grad_norm': '1.351', 'learning_rate': '4.985e-05', 'epoch': '0.2738', 'num_input_tokens_seen': 22259078, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8127', 'grad_norm': '1.747', 'learning_rate': '4.984e-05', 'epoch': '0.2738', 'num_input_tokens_seen': 22261125, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5088', 'grad_norm': '0.9675', 'learning_rate': '4.984e-05', 'epoch': '0.2738', 'num_input_tokens_seen': 22263172, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2525', 'grad_norm': '0.7898', 'learning_rate': '4.984e-05', 'epoch': '0.2739', 'num_input_tokens_seen': 22265219, 'train_runtime': '1.126e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6644', 'grad_norm': '1.47', 'learning_rate': '4.984e-05', 'epoch': '0.2739', 'num_input_tokens_seen': 22267266, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6262', 'grad_norm': '1.53', 'learning_rate': '4.984e-05', 'epoch': '0.2739', 'num_input_tokens_seen': 22269313, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.451', 'grad_norm': '2.419', 'learning_rate': '4.984e-05', 'epoch': '0.2739', 'num_input_tokens_seen': 22271360, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.429', 'grad_norm': '1.014', 'learning_rate': '4.984e-05', 'epoch': '0.274', 'num_input_tokens_seen': 22273407, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6822', 'grad_norm': '1.351', 'learning_rate': '4.984e-05', 'epoch': '0.274', 'num_input_tokens_seen': 22275454, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.1298', 'grad_norm': '0.7339', 'learning_rate': '4.984e-05', 'epoch': '0.274', 'num_input_tokens_seen': 22277501, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.596', 'grad_norm': '1.081', 'learning_rate': '4.984e-05', 'epoch': '0.274', 'num_input_tokens_seen': 22279548, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.264', 'grad_norm': '2.18', 'learning_rate': '4.984e-05', 'epoch': '0.2741', 'num_input_tokens_seen': 22281595, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3498', 'grad_norm': '0.8303', 'learning_rate': '4.984e-05', 'epoch': '0.2741', 'num_input_tokens_seen': 22283642, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.046', 'grad_norm': '1.379', 'learning_rate': '4.984e-05', 'epoch': '0.2741', 'num_input_tokens_seen': 22285689, 'train_runtime': '1.127e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5768', 'grad_norm': '1.222', 'learning_rate': '4.984e-05', 'epoch': '0.2741', 'num_input_tokens_seen': 22287736, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5893', 'grad_norm': '1.15', 'learning_rate': '4.984e-05', 'epoch': '0.2742', 'num_input_tokens_seen': 22289783, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7714', 'grad_norm': '2.226', 'learning_rate': '4.984e-05', 'epoch': '0.2742', 'num_input_tokens_seen': 22291830, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4548', 'grad_norm': '1.043', 'learning_rate': '4.984e-05', 'epoch': '0.2742', 'num_input_tokens_seen': 22293877, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8395', 'grad_norm': '1.736', 'learning_rate': '4.984e-05', 'epoch': '0.2742', 'num_input_tokens_seen': 22295924, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9377', 'grad_norm': '2.298', 'learning_rate': '4.984e-05', 'epoch': '0.2743', 'num_input_tokens_seen': 22297971, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3078', 'grad_norm': '0.8372', 'learning_rate': '4.984e-05', 'epoch': '0.2743', 'num_input_tokens_seen': 22300018, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7202', 'grad_norm': '1.101', 'learning_rate': '4.984e-05', 'epoch': '0.2743', 'num_input_tokens_seen': 22302065, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.624', 'grad_norm': '3.079', 'learning_rate': '4.984e-05', 'epoch': '0.2743', 'num_input_tokens_seen': 22304112, 'train_runtime': '1.128e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3032', 'grad_norm': '0.9955', 'learning_rate': '4.984e-05', 'epoch': '0.2744', 'num_input_tokens_seen': 22306159, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2048', 'grad_norm': '0.8931', 'learning_rate': '4.984e-05', 'epoch': '0.2744', 'num_input_tokens_seen': 22308206, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7855', 'grad_norm': '1.679', 'learning_rate': '4.984e-05', 'epoch': '0.2744', 'num_input_tokens_seen': 22310253, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5206', 'grad_norm': '0.9842', 'learning_rate': '4.984e-05', 'epoch': '0.2744', 'num_input_tokens_seen': 22312300, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8691', 'grad_norm': '1.372', 'learning_rate': '4.984e-05', 'epoch': '0.2745', 'num_input_tokens_seen': 22314347, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.623', 'grad_norm': '2.346', 'learning_rate': '4.984e-05', 'epoch': '0.2745', 'num_input_tokens_seen': 22316394, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.729', 'grad_norm': '1.271', 'learning_rate': '4.984e-05', 'epoch': '0.2745', 'num_input_tokens_seen': 22318441, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3153', 'grad_norm': '0.9947', 'learning_rate': '4.984e-05', 'epoch': '0.2745', 'num_input_tokens_seen': 22320488, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.456', 'grad_norm': '0.9762', 'learning_rate': '4.984e-05', 'epoch': '0.2746', 'num_input_tokens_seen': 22322535, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4356', 'grad_norm': '1.129', 'learning_rate': '4.984e-05', 'epoch': '0.2746', 'num_input_tokens_seen': 22324582, 'train_runtime': '1.129e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.328', 'grad_norm': '2.163', 'learning_rate': '4.984e-05', 'epoch': '0.2746', 'num_input_tokens_seen': 22326629, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3586', 'grad_norm': '1.069', 'learning_rate': '4.984e-05', 'epoch': '0.2746', 'num_input_tokens_seen': 22328676, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.422', 'grad_norm': '2.182', 'learning_rate': '4.984e-05', 'epoch': '0.2747', 'num_input_tokens_seen': 22330723, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2226', 'grad_norm': '0.7983', 'learning_rate': '4.984e-05', 'epoch': '0.2747', 'num_input_tokens_seen': 22332770, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3847', 'grad_norm': '0.9572', 'learning_rate': '4.984e-05', 'epoch': '0.2747', 'num_input_tokens_seen': 22334817, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.845', 'grad_norm': '1.177', 'learning_rate': '4.984e-05', 'epoch': '0.2747', 'num_input_tokens_seen': 22336864, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6027', 'grad_norm': '1.184', 'learning_rate': '4.984e-05', 'epoch': '0.2748', 'num_input_tokens_seen': 22338911, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.049', 'grad_norm': '1.872', 'learning_rate': '4.984e-05', 'epoch': '0.2748', 'num_input_tokens_seen': 22340958, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6723', 'grad_norm': '1.216', 'learning_rate': '4.984e-05', 'epoch': '0.2748', 'num_input_tokens_seen': 22343005, 'train_runtime': '1.13e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7415', 'grad_norm': '1.189', 'learning_rate': '4.984e-05', 'epoch': '0.2748', 'num_input_tokens_seen': 22345052, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.397', 'grad_norm': '0.922', 'learning_rate': '4.984e-05', 'epoch': '0.2749', 'num_input_tokens_seen': 22347099, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7517', 'grad_norm': '1.224', 'learning_rate': '4.984e-05', 'epoch': '0.2749', 'num_input_tokens_seen': 22349146, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5149', 'grad_norm': '1.425', 'learning_rate': '4.984e-05', 'epoch': '0.2749', 'num_input_tokens_seen': 22351193, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2505', 'grad_norm': '0.7727', 'learning_rate': '4.984e-05', 'epoch': '0.2749', 'num_input_tokens_seen': 22353240, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5537', 'grad_norm': '1.134', 'learning_rate': '4.984e-05', 'epoch': '0.275', 'num_input_tokens_seen': 22355287, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.477', 'grad_norm': '0.9877', 'learning_rate': '4.984e-05', 'epoch': '0.275', 'num_input_tokens_seen': 22357334, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6667', 'grad_norm': '1.314', 'learning_rate': '4.984e-05', 'epoch': '0.275', 'num_input_tokens_seen': 22359381, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.6864', 'grad_norm': '1.2', 'learning_rate': '4.984e-05', 'epoch': '0.275', 'num_input_tokens_seen': 22361428, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8005', 'grad_norm': '1.649', 'learning_rate': '4.984e-05', 'epoch': '0.2751', 'num_input_tokens_seen': 22363475, 'train_runtime': '1.131e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.215', 'grad_norm': '2.197', 'learning_rate': '4.984e-05', 'epoch': '0.2751', 'num_input_tokens_seen': 22365522, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.916', 'grad_norm': '1.834', 'learning_rate': '4.984e-05', 'epoch': '0.2751', 'num_input_tokens_seen': 22367569, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3763', 'grad_norm': '1.212', 'learning_rate': '4.984e-05', 'epoch': '0.2751', 'num_input_tokens_seen': 22369616, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9041', 'grad_norm': '1.482', 'learning_rate': '4.984e-05', 'epoch': '0.2752', 'num_input_tokens_seen': 22371663, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4225', 'grad_norm': '0.9593', 'learning_rate': '4.984e-05', 'epoch': '0.2752', 'num_input_tokens_seen': 22373710, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9762', 'grad_norm': '1.523', 'learning_rate': '4.984e-05', 'epoch': '0.2752', 'num_input_tokens_seen': 22375757, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.093', 'grad_norm': '2.01', 'learning_rate': '4.984e-05', 'epoch': '0.2752', 'num_input_tokens_seen': 22377804, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.064', 'grad_norm': '1.673', 'learning_rate': '4.984e-05', 'epoch': '0.2753', 'num_input_tokens_seen': 22379851, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.23', 'grad_norm': '1.619', 'learning_rate': '4.984e-05', 'epoch': '0.2753', 'num_input_tokens_seen': 22381898, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2755', 'grad_norm': '0.7643', 'learning_rate': '4.984e-05', 'epoch': '0.2753', 'num_input_tokens_seen': 22383945, 'train_runtime': '1.132e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3722', 'grad_norm': '1.065', 'learning_rate': '4.984e-05', 'epoch': '0.2753', 'num_input_tokens_seen': 22385992, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.153', 'grad_norm': '1.112', 'learning_rate': '4.984e-05', 'epoch': '0.2754', 'num_input_tokens_seen': 22388039, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7587', 'grad_norm': '1.437', 'learning_rate': '4.984e-05', 'epoch': '0.2754', 'num_input_tokens_seen': 22390086, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3212', 'grad_norm': '0.8712', 'learning_rate': '4.984e-05', 'epoch': '0.2754', 'num_input_tokens_seen': 22392133, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8044', 'grad_norm': '1.348', 'learning_rate': '4.984e-05', 'epoch': '0.2754', 'num_input_tokens_seen': 22394180, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3372', 'grad_norm': '0.8588', 'learning_rate': '4.984e-05', 'epoch': '0.2755', 'num_input_tokens_seen': 22396227, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.723', 'grad_norm': '2.221', 'learning_rate': '4.984e-05', 'epoch': '0.2755', 'num_input_tokens_seen': 22398274, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.5876', 'grad_norm': '1.309', 'learning_rate': '4.984e-05', 'epoch': '0.2755', 'num_input_tokens_seen': 22400321, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '2.297', 'grad_norm': '3.076', 'learning_rate': '4.984e-05', 'epoch': '0.2755', 'num_input_tokens_seen': 22402368, 'train_runtime': '1.133e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3814', 'grad_norm': '0.9651', 'learning_rate': '4.984e-05', 'epoch': '0.2756', 'num_input_tokens_seen': 22404415, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.89', 'grad_norm': '2.617', 'learning_rate': '4.984e-05', 'epoch': '0.2756', 'num_input_tokens_seen': 22406462, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8009', 'grad_norm': '1.355', 'learning_rate': '4.984e-05', 'epoch': '0.2756', 'num_input_tokens_seen': 22408509, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.9055', 'grad_norm': '1.314', 'learning_rate': '4.984e-05', 'epoch': '0.2757', 'num_input_tokens_seen': 22410556, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7733', 'grad_norm': '1.341', 'learning_rate': '4.984e-05', 'epoch': '0.2757', 'num_input_tokens_seen': 22412603, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3109', 'grad_norm': '0.8354', 'learning_rate': '4.984e-05', 'epoch': '0.2757', 'num_input_tokens_seen': 22414650, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.234', 'grad_norm': '2.106', 'learning_rate': '4.984e-05', 'epoch': '0.2757', 'num_input_tokens_seen': 22416697, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7127', 'grad_norm': '0.9543', 'learning_rate': '4.984e-05', 'epoch': '0.2758', 'num_input_tokens_seen': 22418744, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8562', 'grad_norm': '1.134', 'learning_rate': '4.984e-05', 'epoch': '0.2758', 'num_input_tokens_seen': 22420791, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.7603', 'grad_norm': '1.334', 'learning_rate': '4.984e-05', 'epoch': '0.2758', 'num_input_tokens_seen': 22422838, 'train_runtime': '1.134e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4836', 'grad_norm': '1.148', 'learning_rate': '4.984e-05', 'epoch': '0.2758', 'num_input_tokens_seen': 22424885, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3066', 'grad_norm': '0.8526', 'learning_rate': '4.984e-05', 'epoch': '0.2759', 'num_input_tokens_seen': 22426932, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.805', 'grad_norm': '2.123', 'learning_rate': '4.984e-05', 'epoch': '0.2759', 'num_input_tokens_seen': 22428979, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4751', 'grad_norm': '1.326', 'learning_rate': '4.984e-05', 'epoch': '0.2759', 'num_input_tokens_seen': 22431026, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.154', 'grad_norm': '1.612', 'learning_rate': '4.984e-05', 'epoch': '0.2759', 'num_input_tokens_seen': 22433073, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3005', 'grad_norm': '0.7994', 'learning_rate': '4.984e-05', 'epoch': '0.276', 'num_input_tokens_seen': 22435120, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.681', 'grad_norm': '2.188', 'learning_rate': '4.984e-05', 'epoch': '0.276', 'num_input_tokens_seen': 22437167, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8854', 'grad_norm': '1.241', 'learning_rate': '4.984e-05', 'epoch': '0.276', 'num_input_tokens_seen': 22439214, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3495', 'grad_norm': '0.872', 'learning_rate': '4.984e-05', 'epoch': '0.276', 'num_input_tokens_seen': 22441261, 'train_runtime': '1.135e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4187', 'grad_norm': '1.042', 'learning_rate': '4.984e-05', 'epoch': '0.2761', 'num_input_tokens_seen': 22443308, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.8562', 'grad_norm': '1.087', 'learning_rate': '4.984e-05', 'epoch': '0.2761', 'num_input_tokens_seen': 22445355, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.351', 'grad_norm': '1.034', 'learning_rate': '4.984e-05', 'epoch': '0.2761', 'num_input_tokens_seen': 22447402, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4428', 'grad_norm': '1.16', 'learning_rate': '4.984e-05', 'epoch': '0.2761', 'num_input_tokens_seen': 22449449, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3395', 'grad_norm': '0.8056', 'learning_rate': '4.984e-05', 'epoch': '0.2762', 'num_input_tokens_seen': 22451496, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.334', 'grad_norm': '1.047', 'learning_rate': '4.984e-05', 'epoch': '0.2762', 'num_input_tokens_seen': 22453543, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.3852', 'grad_norm': '0.9896', 'learning_rate': '4.984e-05', 'epoch': '0.2762', 'num_input_tokens_seen': 22455590, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.298', 'grad_norm': '2.022', 'learning_rate': '4.984e-05', 'epoch': '0.2762', 'num_input_tokens_seen': 22457637, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.4953', 'grad_norm': '1.296', 'learning_rate': '4.984e-05', 'epoch': '0.2763', 'num_input_tokens_seen': 22459684, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '1.352', 'grad_norm': '2.541', 'learning_rate': '4.984e-05', 'epoch': '0.2763', 'num_input_tokens_seen': 22461731, 'train_runtime': '1.136e+04', 'train_tokens_per_second': '1977'} +{'loss': '0.2559', 'grad_norm': '0.9171', 'learning_rate': '4.984e-05', 'epoch': '0.2763', 'num_input_tokens_seen': 22463778, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3337', 'grad_norm': '1.014', 'learning_rate': '4.984e-05', 'epoch': '0.2763', 'num_input_tokens_seen': 22465825, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8584', 'grad_norm': '1.378', 'learning_rate': '4.984e-05', 'epoch': '0.2764', 'num_input_tokens_seen': 22467872, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9717', 'grad_norm': '2.38', 'learning_rate': '4.984e-05', 'epoch': '0.2764', 'num_input_tokens_seen': 22469919, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2477', 'grad_norm': '0.8795', 'learning_rate': '4.984e-05', 'epoch': '0.2764', 'num_input_tokens_seen': 22471966, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.341', 'grad_norm': '0.9391', 'learning_rate': '4.984e-05', 'epoch': '0.2764', 'num_input_tokens_seen': 22474013, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4475', 'grad_norm': '1.223', 'learning_rate': '4.984e-05', 'epoch': '0.2765', 'num_input_tokens_seen': 22476060, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5943', 'grad_norm': '1.435', 'learning_rate': '4.984e-05', 'epoch': '0.2765', 'num_input_tokens_seen': 22478107, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2691', 'grad_norm': '0.9367', 'learning_rate': '4.984e-05', 'epoch': '0.2765', 'num_input_tokens_seen': 22480154, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.199', 'grad_norm': '1.902', 'learning_rate': '4.984e-05', 'epoch': '0.2765', 'num_input_tokens_seen': 22482201, 'train_runtime': '1.137e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.388', 'grad_norm': '2.225', 'learning_rate': '4.984e-05', 'epoch': '0.2766', 'num_input_tokens_seen': 22484248, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.222', 'grad_norm': '1.537', 'learning_rate': '4.984e-05', 'epoch': '0.2766', 'num_input_tokens_seen': 22486295, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.059', 'grad_norm': '1.931', 'learning_rate': '4.984e-05', 'epoch': '0.2766', 'num_input_tokens_seen': 22488342, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6796', 'grad_norm': '1.348', 'learning_rate': '4.984e-05', 'epoch': '0.2766', 'num_input_tokens_seen': 22490389, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.538', 'grad_norm': '1.317', 'learning_rate': '4.984e-05', 'epoch': '0.2767', 'num_input_tokens_seen': 22492436, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.159', 'grad_norm': '1.986', 'learning_rate': '4.984e-05', 'epoch': '0.2767', 'num_input_tokens_seen': 22494483, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8769', 'grad_norm': '2.105', 'learning_rate': '4.984e-05', 'epoch': '0.2767', 'num_input_tokens_seen': 22496530, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6233', 'grad_norm': '1.343', 'learning_rate': '4.984e-05', 'epoch': '0.2767', 'num_input_tokens_seen': 22498577, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.328', 'grad_norm': '1.714', 'learning_rate': '4.984e-05', 'epoch': '0.2768', 'num_input_tokens_seen': 22500624, 'train_runtime': '1.138e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3599', 'grad_norm': '0.9884', 'learning_rate': '4.984e-05', 'epoch': '0.2768', 'num_input_tokens_seen': 22502671, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4769', 'grad_norm': '1.464', 'learning_rate': '4.984e-05', 'epoch': '0.2768', 'num_input_tokens_seen': 22504718, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7942', 'grad_norm': '1.44', 'learning_rate': '4.984e-05', 'epoch': '0.2768', 'num_input_tokens_seen': 22506765, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.134', 'grad_norm': '1.996', 'learning_rate': '4.984e-05', 'epoch': '0.2769', 'num_input_tokens_seen': 22508812, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3942', 'grad_norm': '1.199', 'learning_rate': '4.984e-05', 'epoch': '0.2769', 'num_input_tokens_seen': 22510859, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3707', 'grad_norm': '0.9902', 'learning_rate': '4.984e-05', 'epoch': '0.2769', 'num_input_tokens_seen': 22512906, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.284', 'grad_norm': '2.33', 'learning_rate': '4.984e-05', 'epoch': '0.2769', 'num_input_tokens_seen': 22514953, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5129', 'grad_norm': '1.019', 'learning_rate': '4.984e-05', 'epoch': '0.277', 'num_input_tokens_seen': 22517000, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +[INFO|configuration_utils.py:665] 2026-02-05 05:47:17,895 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 05:47:17,895 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 05:47:18,263 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-11000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 05:47:18,271 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-11000/tokenizer_config.json + +{'loss': '0.6677', 'grad_norm': '1.302', 'learning_rate': '4.984e-05', 'epoch': '0.277', 'num_input_tokens_seen': 22519047, 'train_runtime': '1.139e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8686', 'grad_norm': '1.546', 'learning_rate': '4.984e-05', 'epoch': '0.277', 'num_input_tokens_seen': 22521094, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.037', 'grad_norm': '2.458', 'learning_rate': '4.984e-05', 'epoch': '0.277', 'num_input_tokens_seen': 22523141, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4159', 'grad_norm': '1.006', 'learning_rate': '4.984e-05', 'epoch': '0.2771', 'num_input_tokens_seen': 22525188, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.232', 'grad_norm': '0.8207', 'learning_rate': '4.984e-05', 'epoch': '0.2771', 'num_input_tokens_seen': 22527235, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4593', 'grad_norm': '1.163', 'learning_rate': '4.984e-05', 'epoch': '0.2771', 'num_input_tokens_seen': 22529282, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5984', 'grad_norm': '1.059', 'learning_rate': '4.984e-05', 'epoch': '0.2771', 'num_input_tokens_seen': 22531329, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5985', 'grad_norm': '1.361', 'learning_rate': '4.984e-05', 'epoch': '0.2772', 'num_input_tokens_seen': 22533376, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3388', 'grad_norm': '0.7541', 'learning_rate': '4.984e-05', 'epoch': '0.2772', 'num_input_tokens_seen': 22535423, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.192', 'grad_norm': '2.142', 'learning_rate': '4.984e-05', 'epoch': '0.2772', 'num_input_tokens_seen': 22537470, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.827', 'grad_norm': '2.75', 'learning_rate': '4.984e-05', 'epoch': '0.2772', 'num_input_tokens_seen': 22539517, 'train_runtime': '1.14e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6171', 'grad_norm': '1.464', 'learning_rate': '4.984e-05', 'epoch': '0.2773', 'num_input_tokens_seen': 22541564, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.637', 'grad_norm': '1.923', 'learning_rate': '4.984e-05', 'epoch': '0.2773', 'num_input_tokens_seen': 22543611, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2799', 'grad_norm': '0.9151', 'learning_rate': '4.984e-05', 'epoch': '0.2773', 'num_input_tokens_seen': 22545658, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5091', 'grad_norm': '1.204', 'learning_rate': '4.984e-05', 'epoch': '0.2773', 'num_input_tokens_seen': 22547705, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.059', 'grad_norm': '2.104', 'learning_rate': '4.984e-05', 'epoch': '0.2774', 'num_input_tokens_seen': 22549752, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.549', 'grad_norm': '2.968', 'learning_rate': '4.984e-05', 'epoch': '0.2774', 'num_input_tokens_seen': 22551799, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.979', 'grad_norm': '1.861', 'learning_rate': '4.984e-05', 'epoch': '0.2774', 'num_input_tokens_seen': 22553846, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.587', 'grad_norm': '1.442', 'learning_rate': '4.984e-05', 'epoch': '0.2774', 'num_input_tokens_seen': 22555893, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6586', 'grad_norm': '1.231', 'learning_rate': '4.984e-05', 'epoch': '0.2775', 'num_input_tokens_seen': 22557940, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.02', 'grad_norm': '1.864', 'learning_rate': '4.984e-05', 'epoch': '0.2775', 'num_input_tokens_seen': 22559987, 'train_runtime': '1.141e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8867', 'grad_norm': '1.371', 'learning_rate': '4.984e-05', 'epoch': '0.2775', 'num_input_tokens_seen': 22562034, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9609', 'grad_norm': '1.406', 'learning_rate': '4.984e-05', 'epoch': '0.2775', 'num_input_tokens_seen': 22564081, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3021', 'grad_norm': '0.7669', 'learning_rate': '4.984e-05', 'epoch': '0.2776', 'num_input_tokens_seen': 22566128, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.391', 'grad_norm': '1.303', 'learning_rate': '4.984e-05', 'epoch': '0.2776', 'num_input_tokens_seen': 22568175, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3294', 'grad_norm': '0.831', 'learning_rate': '4.984e-05', 'epoch': '0.2776', 'num_input_tokens_seen': 22570222, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.013', 'grad_norm': '2.903', 'learning_rate': '4.984e-05', 'epoch': '0.2776', 'num_input_tokens_seen': 22572269, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3788', 'grad_norm': '0.95', 'learning_rate': '4.984e-05', 'epoch': '0.2777', 'num_input_tokens_seen': 22574316, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7653', 'grad_norm': '1.194', 'learning_rate': '4.984e-05', 'epoch': '0.2777', 'num_input_tokens_seen': 22576363, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3474', 'grad_norm': '0.7861', 'learning_rate': '4.984e-05', 'epoch': '0.2777', 'num_input_tokens_seen': 22578410, 'train_runtime': '1.142e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9585', 'grad_norm': '1.558', 'learning_rate': '4.984e-05', 'epoch': '0.2777', 'num_input_tokens_seen': 22580457, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3112', 'grad_norm': '0.9425', 'learning_rate': '4.984e-05', 'epoch': '0.2778', 'num_input_tokens_seen': 22582504, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2356', 'grad_norm': '0.844', 'learning_rate': '4.984e-05', 'epoch': '0.2778', 'num_input_tokens_seen': 22584551, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4586', 'grad_norm': '0.8828', 'learning_rate': '4.984e-05', 'epoch': '0.2778', 'num_input_tokens_seen': 22586598, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.228', 'grad_norm': '0.8759', 'learning_rate': '4.984e-05', 'epoch': '0.2778', 'num_input_tokens_seen': 22588645, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3261', 'grad_norm': '0.9272', 'learning_rate': '4.984e-05', 'epoch': '0.2779', 'num_input_tokens_seen': 22590692, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3835', 'grad_norm': '0.9093', 'learning_rate': '4.984e-05', 'epoch': '0.2779', 'num_input_tokens_seen': 22592739, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4009', 'grad_norm': '0.9268', 'learning_rate': '4.984e-05', 'epoch': '0.2779', 'num_input_tokens_seen': 22594786, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7218', 'grad_norm': '1.303', 'learning_rate': '4.984e-05', 'epoch': '0.2779', 'num_input_tokens_seen': 22596833, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.677', 'grad_norm': '3.2', 'learning_rate': '4.984e-05', 'epoch': '0.278', 'num_input_tokens_seen': 22598880, 'train_runtime': '1.143e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4745', 'grad_norm': '1.153', 'learning_rate': '4.984e-05', 'epoch': '0.278', 'num_input_tokens_seen': 22600927, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6852', 'grad_norm': '1.285', 'learning_rate': '4.984e-05', 'epoch': '0.278', 'num_input_tokens_seen': 22602974, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7561', 'grad_norm': '1.346', 'learning_rate': '4.984e-05', 'epoch': '0.278', 'num_input_tokens_seen': 22605021, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2958', 'grad_norm': '0.8985', 'learning_rate': '4.984e-05', 'epoch': '0.2781', 'num_input_tokens_seen': 22607068, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.197', 'grad_norm': '0.8472', 'learning_rate': '4.984e-05', 'epoch': '0.2781', 'num_input_tokens_seen': 22609115, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9898', 'grad_norm': '1.496', 'learning_rate': '4.984e-05', 'epoch': '0.2781', 'num_input_tokens_seen': 22611162, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5449', 'grad_norm': '1.526', 'learning_rate': '4.984e-05', 'epoch': '0.2781', 'num_input_tokens_seen': 22613209, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2978', 'grad_norm': '0.9094', 'learning_rate': '4.984e-05', 'epoch': '0.2782', 'num_input_tokens_seen': 22615256, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4301', 'grad_norm': '0.8657', 'learning_rate': '4.984e-05', 'epoch': '0.2782', 'num_input_tokens_seen': 22617303, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2085', 'grad_norm': '0.8773', 'learning_rate': '4.984e-05', 'epoch': '0.2782', 'num_input_tokens_seen': 22619350, 'train_runtime': '1.144e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6663', 'grad_norm': '1.049', 'learning_rate': '4.984e-05', 'epoch': '0.2782', 'num_input_tokens_seen': 22621397, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4258', 'grad_norm': '1.065', 'learning_rate': '4.984e-05', 'epoch': '0.2783', 'num_input_tokens_seen': 22623444, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.012', 'grad_norm': '1.674', 'learning_rate': '4.984e-05', 'epoch': '0.2783', 'num_input_tokens_seen': 22625491, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.944', 'grad_norm': '1.504', 'learning_rate': '4.984e-05', 'epoch': '0.2783', 'num_input_tokens_seen': 22627538, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.609', 'grad_norm': '1.476', 'learning_rate': '4.984e-05', 'epoch': '0.2783', 'num_input_tokens_seen': 22629585, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.343', 'grad_norm': '1.933', 'learning_rate': '4.984e-05', 'epoch': '0.2784', 'num_input_tokens_seen': 22631632, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8712', 'grad_norm': '1.249', 'learning_rate': '4.984e-05', 'epoch': '0.2784', 'num_input_tokens_seen': 22633679, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.519', 'grad_norm': '2.451', 'learning_rate': '4.984e-05', 'epoch': '0.2784', 'num_input_tokens_seen': 22635726, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9857', 'grad_norm': '1.333', 'learning_rate': '4.984e-05', 'epoch': '0.2784', 'num_input_tokens_seen': 22637773, 'train_runtime': '1.145e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4562', 'grad_norm': '0.8588', 'learning_rate': '4.984e-05', 'epoch': '0.2785', 'num_input_tokens_seen': 22639820, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8341', 'grad_norm': '1.238', 'learning_rate': '4.984e-05', 'epoch': '0.2785', 'num_input_tokens_seen': 22641867, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4804', 'grad_norm': '1.307', 'learning_rate': '4.984e-05', 'epoch': '0.2785', 'num_input_tokens_seen': 22643914, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.549', 'grad_norm': '5.231', 'learning_rate': '4.984e-05', 'epoch': '0.2785', 'num_input_tokens_seen': 22645961, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7408', 'grad_norm': '1.594', 'learning_rate': '4.984e-05', 'epoch': '0.2786', 'num_input_tokens_seen': 22648008, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.383', 'grad_norm': '1.031', 'learning_rate': '4.984e-05', 'epoch': '0.2786', 'num_input_tokens_seen': 22650055, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.239', 'grad_norm': '1.024', 'learning_rate': '4.984e-05', 'epoch': '0.2786', 'num_input_tokens_seen': 22652102, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4017', 'grad_norm': '1.068', 'learning_rate': '4.984e-05', 'epoch': '0.2786', 'num_input_tokens_seen': 22654149, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5195', 'grad_norm': '1.112', 'learning_rate': '4.984e-05', 'epoch': '0.2787', 'num_input_tokens_seen': 22656196, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3069', 'grad_norm': '0.8239', 'learning_rate': '4.984e-05', 'epoch': '0.2787', 'num_input_tokens_seen': 22658243, 'train_runtime': '1.146e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5909', 'grad_norm': '1.664', 'learning_rate': '4.984e-05', 'epoch': '0.2787', 'num_input_tokens_seen': 22660290, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.518', 'grad_norm': '0.9142', 'learning_rate': '4.984e-05', 'epoch': '0.2787', 'num_input_tokens_seen': 22662337, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9459', 'grad_norm': '1.594', 'learning_rate': '4.984e-05', 'epoch': '0.2788', 'num_input_tokens_seen': 22664384, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7027', 'grad_norm': '1.261', 'learning_rate': '4.984e-05', 'epoch': '0.2788', 'num_input_tokens_seen': 22666431, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.441', 'grad_norm': '0.9914', 'learning_rate': '4.984e-05', 'epoch': '0.2788', 'num_input_tokens_seen': 22668478, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8435', 'grad_norm': '1.453', 'learning_rate': '4.984e-05', 'epoch': '0.2788', 'num_input_tokens_seen': 22670525, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5264', 'grad_norm': '1.179', 'learning_rate': '4.984e-05', 'epoch': '0.2789', 'num_input_tokens_seen': 22672572, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3254', 'grad_norm': '0.8927', 'learning_rate': '4.984e-05', 'epoch': '0.2789', 'num_input_tokens_seen': 22674619, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4466', 'grad_norm': '1.071', 'learning_rate': '4.984e-05', 'epoch': '0.2789', 'num_input_tokens_seen': 22676666, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.643', 'grad_norm': '2.502', 'learning_rate': '4.984e-05', 'epoch': '0.2789', 'num_input_tokens_seen': 22678713, 'train_runtime': '1.147e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.066', 'grad_norm': '1.619', 'learning_rate': '4.984e-05', 'epoch': '0.279', 'num_input_tokens_seen': 22680760, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6284', 'grad_norm': '1.344', 'learning_rate': '4.984e-05', 'epoch': '0.279', 'num_input_tokens_seen': 22682807, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.21', 'grad_norm': '1.665', 'learning_rate': '4.984e-05', 'epoch': '0.279', 'num_input_tokens_seen': 22684854, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3608', 'grad_norm': '0.881', 'learning_rate': '4.984e-05', 'epoch': '0.279', 'num_input_tokens_seen': 22686901, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5983', 'grad_norm': '1.008', 'learning_rate': '4.984e-05', 'epoch': '0.2791', 'num_input_tokens_seen': 22688948, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.561', 'grad_norm': '1.725', 'learning_rate': '4.984e-05', 'epoch': '0.2791', 'num_input_tokens_seen': 22690995, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7182', 'grad_norm': '1.392', 'learning_rate': '4.984e-05', 'epoch': '0.2791', 'num_input_tokens_seen': 22693042, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3715', 'grad_norm': '0.8836', 'learning_rate': '4.984e-05', 'epoch': '0.2791', 'num_input_tokens_seen': 22695089, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8548', 'grad_norm': '1.541', 'learning_rate': '4.984e-05', 'epoch': '0.2792', 'num_input_tokens_seen': 22697136, 'train_runtime': '1.148e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.842', 'grad_norm': '1.909', 'learning_rate': '4.984e-05', 'epoch': '0.2792', 'num_input_tokens_seen': 22699183, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9', 'grad_norm': '1.675', 'learning_rate': '4.984e-05', 'epoch': '0.2792', 'num_input_tokens_seen': 22701230, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6507', 'grad_norm': '0.9149', 'learning_rate': '4.984e-05', 'epoch': '0.2793', 'num_input_tokens_seen': 22703277, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3396', 'grad_norm': '0.8309', 'learning_rate': '4.984e-05', 'epoch': '0.2793', 'num_input_tokens_seen': 22705324, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5538', 'grad_norm': '0.851', 'learning_rate': '4.984e-05', 'epoch': '0.2793', 'num_input_tokens_seen': 22707371, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.399', 'grad_norm': '2.042', 'learning_rate': '4.984e-05', 'epoch': '0.2793', 'num_input_tokens_seen': 22709418, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1734', 'grad_norm': '0.7742', 'learning_rate': '4.983e-05', 'epoch': '0.2794', 'num_input_tokens_seen': 22711465, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4794', 'grad_norm': '1.048', 'learning_rate': '4.983e-05', 'epoch': '0.2794', 'num_input_tokens_seen': 22713512, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7011', 'grad_norm': '1.263', 'learning_rate': '4.983e-05', 'epoch': '0.2794', 'num_input_tokens_seen': 22715559, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4963', 'grad_norm': '1.182', 'learning_rate': '4.983e-05', 'epoch': '0.2794', 'num_input_tokens_seen': 22717606, 'train_runtime': '1.149e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5593', 'grad_norm': '1.086', 'learning_rate': '4.983e-05', 'epoch': '0.2795', 'num_input_tokens_seen': 22719653, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8976', 'grad_norm': '1.774', 'learning_rate': '4.983e-05', 'epoch': '0.2795', 'num_input_tokens_seen': 22721700, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.681', 'grad_norm': '2.307', 'learning_rate': '4.983e-05', 'epoch': '0.2795', 'num_input_tokens_seen': 22723747, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4591', 'grad_norm': '1.215', 'learning_rate': '4.983e-05', 'epoch': '0.2795', 'num_input_tokens_seen': 22725794, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.632', 'grad_norm': '2.734', 'learning_rate': '4.983e-05', 'epoch': '0.2796', 'num_input_tokens_seen': 22727841, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2219', 'grad_norm': '0.916', 'learning_rate': '4.983e-05', 'epoch': '0.2796', 'num_input_tokens_seen': 22729888, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '1.88', 'learning_rate': '4.983e-05', 'epoch': '0.2796', 'num_input_tokens_seen': 22731935, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.4', 'grad_norm': '2.022', 'learning_rate': '4.983e-05', 'epoch': '0.2796', 'num_input_tokens_seen': 22733982, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4374', 'grad_norm': '1.169', 'learning_rate': '4.983e-05', 'epoch': '0.2797', 'num_input_tokens_seen': 22736029, 'train_runtime': '1.15e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4457', 'grad_norm': '1.314', 'learning_rate': '4.983e-05', 'epoch': '0.2797', 'num_input_tokens_seen': 22738076, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5582', 'grad_norm': '1.472', 'learning_rate': '4.983e-05', 'epoch': '0.2797', 'num_input_tokens_seen': 22740123, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2393', 'grad_norm': '0.8801', 'learning_rate': '4.983e-05', 'epoch': '0.2797', 'num_input_tokens_seen': 22742170, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.076', 'grad_norm': '1.145', 'learning_rate': '4.983e-05', 'epoch': '0.2798', 'num_input_tokens_seen': 22744217, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.175', 'grad_norm': '2.537', 'learning_rate': '4.983e-05', 'epoch': '0.2798', 'num_input_tokens_seen': 22746264, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6398', 'grad_norm': '0.8323', 'learning_rate': '4.983e-05', 'epoch': '0.2798', 'num_input_tokens_seen': 22748311, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5012', 'grad_norm': '1.471', 'learning_rate': '4.983e-05', 'epoch': '0.2798', 'num_input_tokens_seen': 22750358, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5144', 'grad_norm': '1.185', 'learning_rate': '4.983e-05', 'epoch': '0.2799', 'num_input_tokens_seen': 22752405, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2632', 'grad_norm': '0.8207', 'learning_rate': '4.983e-05', 'epoch': '0.2799', 'num_input_tokens_seen': 22754452, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.263', 'grad_norm': '2.641', 'learning_rate': '4.983e-05', 'epoch': '0.2799', 'num_input_tokens_seen': 22756499, 'train_runtime': '1.151e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2629', 'grad_norm': '0.9009', 'learning_rate': '4.983e-05', 'epoch': '0.2799', 'num_input_tokens_seen': 22758546, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.1', 'grad_norm': '2.227', 'learning_rate': '4.983e-05', 'epoch': '0.28', 'num_input_tokens_seen': 22760593, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3023', 'grad_norm': '0.8998', 'learning_rate': '4.983e-05', 'epoch': '0.28', 'num_input_tokens_seen': 22762640, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4885', 'grad_norm': '0.8715', 'learning_rate': '4.983e-05', 'epoch': '0.28', 'num_input_tokens_seen': 22764687, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5771', 'grad_norm': '1.167', 'learning_rate': '4.983e-05', 'epoch': '0.28', 'num_input_tokens_seen': 22766734, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3731', 'grad_norm': '0.9023', 'learning_rate': '4.983e-05', 'epoch': '0.2801', 'num_input_tokens_seen': 22768781, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.024', 'grad_norm': '1.879', 'learning_rate': '4.983e-05', 'epoch': '0.2801', 'num_input_tokens_seen': 22770828, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6412', 'grad_norm': '1.257', 'learning_rate': '4.983e-05', 'epoch': '0.2801', 'num_input_tokens_seen': 22772875, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5359', 'grad_norm': '1.353', 'learning_rate': '4.983e-05', 'epoch': '0.2801', 'num_input_tokens_seen': 22774922, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8643', 'grad_norm': '1.393', 'learning_rate': '4.983e-05', 'epoch': '0.2802', 'num_input_tokens_seen': 22776969, 'train_runtime': '1.152e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9321', 'grad_norm': '1.469', 'learning_rate': '4.983e-05', 'epoch': '0.2802', 'num_input_tokens_seen': 22779016, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7254', 'grad_norm': '1.107', 'learning_rate': '4.983e-05', 'epoch': '0.2802', 'num_input_tokens_seen': 22781063, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3061', 'grad_norm': '0.7522', 'learning_rate': '4.983e-05', 'epoch': '0.2802', 'num_input_tokens_seen': 22783110, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9055', 'grad_norm': '1.73', 'learning_rate': '4.983e-05', 'epoch': '0.2803', 'num_input_tokens_seen': 22785157, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.305', 'grad_norm': '0.7645', 'learning_rate': '4.983e-05', 'epoch': '0.2803', 'num_input_tokens_seen': 22787204, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6325', 'grad_norm': '1.174', 'learning_rate': '4.983e-05', 'epoch': '0.2803', 'num_input_tokens_seen': 22789251, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.84', 'grad_norm': '1.238', 'learning_rate': '4.983e-05', 'epoch': '0.2803', 'num_input_tokens_seen': 22791298, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9637', 'grad_norm': '1.536', 'learning_rate': '4.983e-05', 'epoch': '0.2804', 'num_input_tokens_seen': 22793345, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2914', 'grad_norm': '0.895', 'learning_rate': '4.983e-05', 'epoch': '0.2804', 'num_input_tokens_seen': 22795392, 'train_runtime': '1.153e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8609', 'grad_norm': '1.447', 'learning_rate': '4.983e-05', 'epoch': '0.2804', 'num_input_tokens_seen': 22797439, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4265', 'grad_norm': '1.321', 'learning_rate': '4.983e-05', 'epoch': '0.2804', 'num_input_tokens_seen': 22799486, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.267', 'grad_norm': '2.104', 'learning_rate': '4.983e-05', 'epoch': '0.2805', 'num_input_tokens_seen': 22801533, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2525', 'grad_norm': '0.8735', 'learning_rate': '4.983e-05', 'epoch': '0.2805', 'num_input_tokens_seen': 22803580, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2973', 'grad_norm': '0.8255', 'learning_rate': '4.983e-05', 'epoch': '0.2805', 'num_input_tokens_seen': 22805627, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4805', 'grad_norm': '1.149', 'learning_rate': '4.983e-05', 'epoch': '0.2805', 'num_input_tokens_seen': 22807674, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.639', 'grad_norm': '2.146', 'learning_rate': '4.983e-05', 'epoch': '0.2806', 'num_input_tokens_seen': 22809721, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.382', 'grad_norm': '1.32', 'learning_rate': '4.983e-05', 'epoch': '0.2806', 'num_input_tokens_seen': 22811768, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6687', 'grad_norm': '1.32', 'learning_rate': '4.983e-05', 'epoch': '0.2806', 'num_input_tokens_seen': 22813815, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9201', 'grad_norm': '1.585', 'learning_rate': '4.983e-05', 'epoch': '0.2806', 'num_input_tokens_seen': 22815862, 'train_runtime': '1.154e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4025', 'grad_norm': '1.027', 'learning_rate': '4.983e-05', 'epoch': '0.2807', 'num_input_tokens_seen': 22817909, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9801', 'grad_norm': '2.138', 'learning_rate': '4.983e-05', 'epoch': '0.2807', 'num_input_tokens_seen': 22819956, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6905', 'grad_norm': '1.305', 'learning_rate': '4.983e-05', 'epoch': '0.2807', 'num_input_tokens_seen': 22822003, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7037', 'grad_norm': '1.27', 'learning_rate': '4.983e-05', 'epoch': '0.2807', 'num_input_tokens_seen': 22824050, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6055', 'grad_norm': '1.017', 'learning_rate': '4.983e-05', 'epoch': '0.2808', 'num_input_tokens_seen': 22826097, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7145', 'grad_norm': '1.365', 'learning_rate': '4.983e-05', 'epoch': '0.2808', 'num_input_tokens_seen': 22828144, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9248', 'grad_norm': '1.884', 'learning_rate': '4.983e-05', 'epoch': '0.2808', 'num_input_tokens_seen': 22830191, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.268', 'grad_norm': '1.802', 'learning_rate': '4.983e-05', 'epoch': '0.2808', 'num_input_tokens_seen': 22832238, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6163', 'grad_norm': '1.262', 'learning_rate': '4.983e-05', 'epoch': '0.2809', 'num_input_tokens_seen': 22834285, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4156', 'grad_norm': '1.027', 'learning_rate': '4.983e-05', 'epoch': '0.2809', 'num_input_tokens_seen': 22836332, 'train_runtime': '1.155e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.674', 'grad_norm': '2.379', 'learning_rate': '4.983e-05', 'epoch': '0.2809', 'num_input_tokens_seen': 22838379, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2557', 'grad_norm': '0.8359', 'learning_rate': '4.983e-05', 'epoch': '0.2809', 'num_input_tokens_seen': 22840426, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5412', 'grad_norm': '1.37', 'learning_rate': '4.983e-05', 'epoch': '0.281', 'num_input_tokens_seen': 22842473, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5589', 'grad_norm': '1.347', 'learning_rate': '4.983e-05', 'epoch': '0.281', 'num_input_tokens_seen': 22844520, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6973', 'grad_norm': '1.679', 'learning_rate': '4.983e-05', 'epoch': '0.281', 'num_input_tokens_seen': 22846567, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.04', 'grad_norm': '1.502', 'learning_rate': '4.983e-05', 'epoch': '0.281', 'num_input_tokens_seen': 22848614, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.06', 'grad_norm': '1.427', 'learning_rate': '4.983e-05', 'epoch': '0.2811', 'num_input_tokens_seen': 22850661, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.753', 'grad_norm': '1.441', 'learning_rate': '4.983e-05', 'epoch': '0.2811', 'num_input_tokens_seen': 22852708, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4735', 'grad_norm': '1.134', 'learning_rate': '4.983e-05', 'epoch': '0.2811', 'num_input_tokens_seen': 22854755, 'train_runtime': '1.156e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7803', 'grad_norm': '1.382', 'learning_rate': '4.983e-05', 'epoch': '0.2811', 'num_input_tokens_seen': 22856802, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8673', 'grad_norm': '1.182', 'learning_rate': '4.983e-05', 'epoch': '0.2812', 'num_input_tokens_seen': 22858849, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3478', 'grad_norm': '0.8667', 'learning_rate': '4.983e-05', 'epoch': '0.2812', 'num_input_tokens_seen': 22860896, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.603', 'grad_norm': '1.347', 'learning_rate': '4.983e-05', 'epoch': '0.2812', 'num_input_tokens_seen': 22862943, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.494', 'grad_norm': '2.031', 'learning_rate': '4.983e-05', 'epoch': '0.2812', 'num_input_tokens_seen': 22864990, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7619', 'grad_norm': '1.006', 'learning_rate': '4.983e-05', 'epoch': '0.2813', 'num_input_tokens_seen': 22867037, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9525', 'grad_norm': '1.063', 'learning_rate': '4.983e-05', 'epoch': '0.2813', 'num_input_tokens_seen': 22869084, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.549', 'grad_norm': '2.272', 'learning_rate': '4.983e-05', 'epoch': '0.2813', 'num_input_tokens_seen': 22871131, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6999', 'grad_norm': '0.9621', 'learning_rate': '4.983e-05', 'epoch': '0.2813', 'num_input_tokens_seen': 22873178, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '1.28', 'learning_rate': '4.983e-05', 'epoch': '0.2814', 'num_input_tokens_seen': 22875225, 'train_runtime': '1.157e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.946', 'grad_norm': '2.372', 'learning_rate': '4.983e-05', 'epoch': '0.2814', 'num_input_tokens_seen': 22877272, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.324', 'grad_norm': '1.938', 'learning_rate': '4.983e-05', 'epoch': '0.2814', 'num_input_tokens_seen': 22879319, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7541', 'grad_norm': '1.46', 'learning_rate': '4.983e-05', 'epoch': '0.2814', 'num_input_tokens_seen': 22881366, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5175', 'grad_norm': '1.088', 'learning_rate': '4.983e-05', 'epoch': '0.2815', 'num_input_tokens_seen': 22883413, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.45', 'grad_norm': '2.51', 'learning_rate': '4.983e-05', 'epoch': '0.2815', 'num_input_tokens_seen': 22885460, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5133', 'grad_norm': '0.9519', 'learning_rate': '4.983e-05', 'epoch': '0.2815', 'num_input_tokens_seen': 22887507, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5792', 'grad_norm': '0.9053', 'learning_rate': '4.983e-05', 'epoch': '0.2815', 'num_input_tokens_seen': 22889554, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6201', 'grad_norm': '1.098', 'learning_rate': '4.983e-05', 'epoch': '0.2816', 'num_input_tokens_seen': 22891601, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7531', 'grad_norm': '1.403', 'learning_rate': '4.983e-05', 'epoch': '0.2816', 'num_input_tokens_seen': 22893648, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.917', 'grad_norm': '2.36', 'learning_rate': '4.983e-05', 'epoch': '0.2816', 'num_input_tokens_seen': 22895695, 'train_runtime': '1.158e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8071', 'grad_norm': '1.2', 'learning_rate': '4.983e-05', 'epoch': '0.2816', 'num_input_tokens_seen': 22897742, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.729', 'grad_norm': '1.014', 'learning_rate': '4.983e-05', 'epoch': '0.2817', 'num_input_tokens_seen': 22899789, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.471', 'grad_norm': '2.418', 'learning_rate': '4.983e-05', 'epoch': '0.2817', 'num_input_tokens_seen': 22901836, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.299', 'grad_norm': '2.389', 'learning_rate': '4.983e-05', 'epoch': '0.2817', 'num_input_tokens_seen': 22903883, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4601', 'grad_norm': '0.9848', 'learning_rate': '4.983e-05', 'epoch': '0.2817', 'num_input_tokens_seen': 22905930, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9927', 'grad_norm': '1.995', 'learning_rate': '4.983e-05', 'epoch': '0.2818', 'num_input_tokens_seen': 22907977, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.013', 'grad_norm': '2.128', 'learning_rate': '4.983e-05', 'epoch': '0.2818', 'num_input_tokens_seen': 22910024, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5954', 'grad_norm': '1.177', 'learning_rate': '4.983e-05', 'epoch': '0.2818', 'num_input_tokens_seen': 22912071, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4089', 'grad_norm': '1.107', 'learning_rate': '4.983e-05', 'epoch': '0.2818', 'num_input_tokens_seen': 22914118, 'train_runtime': '1.159e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.095', 'grad_norm': '1.837', 'learning_rate': '4.983e-05', 'epoch': '0.2819', 'num_input_tokens_seen': 22916165, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.401', 'grad_norm': '2.435', 'learning_rate': '4.983e-05', 'epoch': '0.2819', 'num_input_tokens_seen': 22918212, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6026', 'grad_norm': '1.292', 'learning_rate': '4.983e-05', 'epoch': '0.2819', 'num_input_tokens_seen': 22920259, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.231', 'grad_norm': '1.978', 'learning_rate': '4.983e-05', 'epoch': '0.2819', 'num_input_tokens_seen': 22922306, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4271', 'grad_norm': '0.8777', 'learning_rate': '4.983e-05', 'epoch': '0.282', 'num_input_tokens_seen': 22924353, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6938', 'grad_norm': '1.221', 'learning_rate': '4.983e-05', 'epoch': '0.282', 'num_input_tokens_seen': 22926400, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9861', 'grad_norm': '1.616', 'learning_rate': '4.983e-05', 'epoch': '0.282', 'num_input_tokens_seen': 22928447, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8585', 'grad_norm': '1.556', 'learning_rate': '4.983e-05', 'epoch': '0.282', 'num_input_tokens_seen': 22930494, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5306', 'grad_norm': '1.158', 'learning_rate': '4.983e-05', 'epoch': '0.2821', 'num_input_tokens_seen': 22932541, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6497', 'grad_norm': '1.653', 'learning_rate': '4.983e-05', 'epoch': '0.2821', 'num_input_tokens_seen': 22934588, 'train_runtime': '1.16e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.81', 'grad_norm': '2.585', 'learning_rate': '4.983e-05', 'epoch': '0.2821', 'num_input_tokens_seen': 22936635, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3275', 'grad_norm': '0.9728', 'learning_rate': '4.983e-05', 'epoch': '0.2821', 'num_input_tokens_seen': 22938682, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4673', 'grad_norm': '1.228', 'learning_rate': '4.983e-05', 'epoch': '0.2822', 'num_input_tokens_seen': 22940729, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.325', 'grad_norm': '0.8242', 'learning_rate': '4.983e-05', 'epoch': '0.2822', 'num_input_tokens_seen': 22942776, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8508', 'grad_norm': '1.36', 'learning_rate': '4.983e-05', 'epoch': '0.2822', 'num_input_tokens_seen': 22944823, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4537', 'grad_norm': '0.9828', 'learning_rate': '4.983e-05', 'epoch': '0.2822', 'num_input_tokens_seen': 22946870, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.054', 'grad_norm': '2.137', 'learning_rate': '4.983e-05', 'epoch': '0.2823', 'num_input_tokens_seen': 22948917, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4421', 'grad_norm': '1.032', 'learning_rate': '4.983e-05', 'epoch': '0.2823', 'num_input_tokens_seen': 22950964, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.741', 'grad_norm': '2.333', 'learning_rate': '4.983e-05', 'epoch': '0.2823', 'num_input_tokens_seen': 22953011, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.32', 'grad_norm': '2.512', 'learning_rate': '4.983e-05', 'epoch': '0.2823', 'num_input_tokens_seen': 22955058, 'train_runtime': '1.161e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8611', 'grad_norm': '1.37', 'learning_rate': '4.983e-05', 'epoch': '0.2824', 'num_input_tokens_seen': 22957105, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8503', 'grad_norm': '1.558', 'learning_rate': '4.983e-05', 'epoch': '0.2824', 'num_input_tokens_seen': 22959152, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.207', 'grad_norm': '2.772', 'learning_rate': '4.983e-05', 'epoch': '0.2824', 'num_input_tokens_seen': 22961199, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7815', 'grad_norm': '1.385', 'learning_rate': '4.983e-05', 'epoch': '0.2824', 'num_input_tokens_seen': 22963246, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8751', 'grad_norm': '1.457', 'learning_rate': '4.983e-05', 'epoch': '0.2825', 'num_input_tokens_seen': 22965293, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7082', 'grad_norm': '1.477', 'learning_rate': '4.983e-05', 'epoch': '0.2825', 'num_input_tokens_seen': 22967340, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.063', 'grad_norm': '1.576', 'learning_rate': '4.983e-05', 'epoch': '0.2825', 'num_input_tokens_seen': 22969387, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4695', 'grad_norm': '1.066', 'learning_rate': '4.983e-05', 'epoch': '0.2825', 'num_input_tokens_seen': 22971434, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5519', 'grad_norm': '1.026', 'learning_rate': '4.983e-05', 'epoch': '0.2826', 'num_input_tokens_seen': 22973481, 'train_runtime': '1.162e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4372', 'grad_norm': '1.106', 'learning_rate': '4.983e-05', 'epoch': '0.2826', 'num_input_tokens_seen': 22975528, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.188', 'grad_norm': '1.859', 'learning_rate': '4.983e-05', 'epoch': '0.2826', 'num_input_tokens_seen': 22977575, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4518', 'grad_norm': '1.147', 'learning_rate': '4.983e-05', 'epoch': '0.2826', 'num_input_tokens_seen': 22979622, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5416', 'grad_norm': '1.21', 'learning_rate': '4.983e-05', 'epoch': '0.2827', 'num_input_tokens_seen': 22981669, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2596', 'grad_norm': '0.9065', 'learning_rate': '4.983e-05', 'epoch': '0.2827', 'num_input_tokens_seen': 22983716, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2739', 'grad_norm': '0.8614', 'learning_rate': '4.983e-05', 'epoch': '0.2827', 'num_input_tokens_seen': 22985763, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8108', 'grad_norm': '1.589', 'learning_rate': '4.983e-05', 'epoch': '0.2828', 'num_input_tokens_seen': 22987810, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.642', 'grad_norm': '2.902', 'learning_rate': '4.983e-05', 'epoch': '0.2828', 'num_input_tokens_seen': 22989857, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8065', 'grad_norm': '1.478', 'learning_rate': '4.983e-05', 'epoch': '0.2828', 'num_input_tokens_seen': 22991904, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5895', 'grad_norm': '1.029', 'learning_rate': '4.983e-05', 'epoch': '0.2828', 'num_input_tokens_seen': 22993951, 'train_runtime': '1.163e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.524', 'grad_norm': '2.842', 'learning_rate': '4.983e-05', 'epoch': '0.2829', 'num_input_tokens_seen': 22995998, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4649', 'grad_norm': '1.12', 'learning_rate': '4.983e-05', 'epoch': '0.2829', 'num_input_tokens_seen': 22998045, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4125', 'grad_norm': '1.075', 'learning_rate': '4.983e-05', 'epoch': '0.2829', 'num_input_tokens_seen': 23000092, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2672', 'grad_norm': '0.856', 'learning_rate': '4.983e-05', 'epoch': '0.2829', 'num_input_tokens_seen': 23002139, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3468', 'grad_norm': '0.8762', 'learning_rate': '4.983e-05', 'epoch': '0.283', 'num_input_tokens_seen': 23004186, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4663', 'grad_norm': '1.261', 'learning_rate': '4.983e-05', 'epoch': '0.283', 'num_input_tokens_seen': 23006233, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7723', 'grad_norm': '1.223', 'learning_rate': '4.983e-05', 'epoch': '0.283', 'num_input_tokens_seen': 23008280, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.198', 'grad_norm': '1.999', 'learning_rate': '4.983e-05', 'epoch': '0.283', 'num_input_tokens_seen': 23010327, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6761', 'grad_norm': '1.307', 'learning_rate': '4.983e-05', 'epoch': '0.2831', 'num_input_tokens_seen': 23012374, 'train_runtime': '1.164e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4003', 'grad_norm': '0.8773', 'learning_rate': '4.983e-05', 'epoch': '0.2831', 'num_input_tokens_seen': 23014421, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7804', 'grad_norm': '1.304', 'learning_rate': '4.983e-05', 'epoch': '0.2831', 'num_input_tokens_seen': 23016468, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4415', 'grad_norm': '1.036', 'learning_rate': '4.983e-05', 'epoch': '0.2831', 'num_input_tokens_seen': 23018515, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.448', 'grad_norm': '2.839', 'learning_rate': '4.983e-05', 'epoch': '0.2832', 'num_input_tokens_seen': 23020562, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4355', 'grad_norm': '1.005', 'learning_rate': '4.983e-05', 'epoch': '0.2832', 'num_input_tokens_seen': 23022609, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2751', 'grad_norm': '0.979', 'learning_rate': '4.983e-05', 'epoch': '0.2832', 'num_input_tokens_seen': 23024656, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8621', 'grad_norm': '1.371', 'learning_rate': '4.983e-05', 'epoch': '0.2832', 'num_input_tokens_seen': 23026703, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7289', 'grad_norm': '1.189', 'learning_rate': '4.983e-05', 'epoch': '0.2833', 'num_input_tokens_seen': 23028750, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6577', 'grad_norm': '1.126', 'learning_rate': '4.983e-05', 'epoch': '0.2833', 'num_input_tokens_seen': 23030797, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4937', 'grad_norm': '1.165', 'learning_rate': '4.983e-05', 'epoch': '0.2833', 'num_input_tokens_seen': 23032844, 'train_runtime': '1.165e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3676', 'grad_norm': '1.028', 'learning_rate': '4.983e-05', 'epoch': '0.2833', 'num_input_tokens_seen': 23034891, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8606', 'grad_norm': '1.374', 'learning_rate': '4.983e-05', 'epoch': '0.2834', 'num_input_tokens_seen': 23036938, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5827', 'grad_norm': '1.017', 'learning_rate': '4.983e-05', 'epoch': '0.2834', 'num_input_tokens_seen': 23038985, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2021', 'grad_norm': '0.8457', 'learning_rate': '4.983e-05', 'epoch': '0.2834', 'num_input_tokens_seen': 23041032, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9597', 'grad_norm': '1.988', 'learning_rate': '4.983e-05', 'epoch': '0.2834', 'num_input_tokens_seen': 23043079, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2722', 'grad_norm': '0.8329', 'learning_rate': '4.983e-05', 'epoch': '0.2835', 'num_input_tokens_seen': 23045126, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1656', 'grad_norm': '0.8031', 'learning_rate': '4.983e-05', 'epoch': '0.2835', 'num_input_tokens_seen': 23047173, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2977', 'grad_norm': '0.8574', 'learning_rate': '4.983e-05', 'epoch': '0.2835', 'num_input_tokens_seen': 23049220, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8106', 'grad_norm': '1.35', 'learning_rate': '4.983e-05', 'epoch': '0.2835', 'num_input_tokens_seen': 23051267, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3655', 'grad_norm': '0.7799', 'learning_rate': '4.983e-05', 'epoch': '0.2836', 'num_input_tokens_seen': 23053314, 'train_runtime': '1.166e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5383', 'grad_norm': '1.07', 'learning_rate': '4.983e-05', 'epoch': '0.2836', 'num_input_tokens_seen': 23055361, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2325', 'grad_norm': '0.8162', 'learning_rate': '4.983e-05', 'epoch': '0.2836', 'num_input_tokens_seen': 23057408, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2668', 'grad_norm': '1.21', 'learning_rate': '4.983e-05', 'epoch': '0.2836', 'num_input_tokens_seen': 23059455, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4712', 'grad_norm': '0.9822', 'learning_rate': '4.983e-05', 'epoch': '0.2837', 'num_input_tokens_seen': 23061502, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3915', 'grad_norm': '0.8093', 'learning_rate': '4.983e-05', 'epoch': '0.2837', 'num_input_tokens_seen': 23063549, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3205', 'grad_norm': '0.7799', 'learning_rate': '4.983e-05', 'epoch': '0.2837', 'num_input_tokens_seen': 23065596, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8636', 'grad_norm': '1.638', 'learning_rate': '4.983e-05', 'epoch': '0.2837', 'num_input_tokens_seen': 23067643, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.885', 'grad_norm': '1.656', 'learning_rate': '4.983e-05', 'epoch': '0.2838', 'num_input_tokens_seen': 23069690, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.897', 'grad_norm': '3.738', 'learning_rate': '4.983e-05', 'epoch': '0.2838', 'num_input_tokens_seen': 23071737, 'train_runtime': '1.167e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7793', 'grad_norm': '1.388', 'learning_rate': '4.983e-05', 'epoch': '0.2838', 'num_input_tokens_seen': 23073784, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3654', 'grad_norm': '0.8644', 'learning_rate': '4.983e-05', 'epoch': '0.2838', 'num_input_tokens_seen': 23075831, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.348', 'grad_norm': '0.9274', 'learning_rate': '4.983e-05', 'epoch': '0.2839', 'num_input_tokens_seen': 23077878, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9717', 'grad_norm': '2.333', 'learning_rate': '4.983e-05', 'epoch': '0.2839', 'num_input_tokens_seen': 23079925, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.585', 'grad_norm': '2.028', 'learning_rate': '4.983e-05', 'epoch': '0.2839', 'num_input_tokens_seen': 23081972, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4263', 'grad_norm': '0.9456', 'learning_rate': '4.983e-05', 'epoch': '0.2839', 'num_input_tokens_seen': 23084019, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3491', 'grad_norm': '0.8457', 'learning_rate': '4.983e-05', 'epoch': '0.284', 'num_input_tokens_seen': 23086066, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.372', 'grad_norm': '2.392', 'learning_rate': '4.983e-05', 'epoch': '0.284', 'num_input_tokens_seen': 23088113, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6022', 'grad_norm': '0.7961', 'learning_rate': '4.983e-05', 'epoch': '0.284', 'num_input_tokens_seen': 23090160, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5814', 'grad_norm': '0.8419', 'learning_rate': '4.983e-05', 'epoch': '0.284', 'num_input_tokens_seen': 23092207, 'train_runtime': '1.168e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.642', 'grad_norm': '1.092', 'learning_rate': '4.983e-05', 'epoch': '0.2841', 'num_input_tokens_seen': 23094254, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2495', 'grad_norm': '0.8341', 'learning_rate': '4.983e-05', 'epoch': '0.2841', 'num_input_tokens_seen': 23096301, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3533', 'grad_norm': '0.9881', 'learning_rate': '4.983e-05', 'epoch': '0.2841', 'num_input_tokens_seen': 23098348, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8288', 'grad_norm': '1.211', 'learning_rate': '4.983e-05', 'epoch': '0.2841', 'num_input_tokens_seen': 23100395, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9533', 'grad_norm': '1.587', 'learning_rate': '4.983e-05', 'epoch': '0.2842', 'num_input_tokens_seen': 23102442, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3717', 'grad_norm': '0.9773', 'learning_rate': '4.983e-05', 'epoch': '0.2842', 'num_input_tokens_seen': 23104489, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4567', 'grad_norm': '1.004', 'learning_rate': '4.983e-05', 'epoch': '0.2842', 'num_input_tokens_seen': 23106536, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3695', 'grad_norm': '0.8687', 'learning_rate': '4.983e-05', 'epoch': '0.2842', 'num_input_tokens_seen': 23108583, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8758', 'grad_norm': '1.423', 'learning_rate': '4.983e-05', 'epoch': '0.2843', 'num_input_tokens_seen': 23110630, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.773', 'grad_norm': '1.424', 'learning_rate': '4.983e-05', 'epoch': '0.2843', 'num_input_tokens_seen': 23112677, 'train_runtime': '1.169e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.089', 'grad_norm': '1.709', 'learning_rate': '4.983e-05', 'epoch': '0.2843', 'num_input_tokens_seen': 23114724, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8678', 'grad_norm': '1.536', 'learning_rate': '4.983e-05', 'epoch': '0.2843', 'num_input_tokens_seen': 23116771, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9764', 'grad_norm': '1.47', 'learning_rate': '4.983e-05', 'epoch': '0.2844', 'num_input_tokens_seen': 23118818, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5229', 'grad_norm': '1.112', 'learning_rate': '4.983e-05', 'epoch': '0.2844', 'num_input_tokens_seen': 23120865, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5156', 'grad_norm': '1.089', 'learning_rate': '4.983e-05', 'epoch': '0.2844', 'num_input_tokens_seen': 23122912, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.731', 'grad_norm': '2.349', 'learning_rate': '4.983e-05', 'epoch': '0.2844', 'num_input_tokens_seen': 23124959, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.047', 'grad_norm': '1.908', 'learning_rate': '4.983e-05', 'epoch': '0.2845', 'num_input_tokens_seen': 23127006, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2477', 'grad_norm': '0.8815', 'learning_rate': '4.983e-05', 'epoch': '0.2845', 'num_input_tokens_seen': 23129053, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.145', 'grad_norm': '2.212', 'learning_rate': '4.983e-05', 'epoch': '0.2845', 'num_input_tokens_seen': 23131100, 'train_runtime': '1.17e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.471', 'grad_norm': '2.095', 'learning_rate': '4.983e-05', 'epoch': '0.2845', 'num_input_tokens_seen': 23133147, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2225', 'grad_norm': '0.7555', 'learning_rate': '4.983e-05', 'epoch': '0.2846', 'num_input_tokens_seen': 23135194, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.438', 'grad_norm': '1.036', 'learning_rate': '4.983e-05', 'epoch': '0.2846', 'num_input_tokens_seen': 23137241, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.669', 'grad_norm': '2.428', 'learning_rate': '4.983e-05', 'epoch': '0.2846', 'num_input_tokens_seen': 23139288, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2305', 'grad_norm': '0.8458', 'learning_rate': '4.983e-05', 'epoch': '0.2846', 'num_input_tokens_seen': 23141335, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4914', 'grad_norm': '1.207', 'learning_rate': '4.983e-05', 'epoch': '0.2847', 'num_input_tokens_seen': 23143382, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.444', 'grad_norm': '2.067', 'learning_rate': '4.982e-05', 'epoch': '0.2847', 'num_input_tokens_seen': 23145429, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7062', 'grad_norm': '1.112', 'learning_rate': '4.982e-05', 'epoch': '0.2847', 'num_input_tokens_seen': 23147476, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4097', 'grad_norm': '1.011', 'learning_rate': '4.982e-05', 'epoch': '0.2847', 'num_input_tokens_seen': 23149523, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '1.175', 'learning_rate': '4.982e-05', 'epoch': '0.2848', 'num_input_tokens_seen': 23151570, 'train_runtime': '1.171e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5748', 'grad_norm': '1.307', 'learning_rate': '4.982e-05', 'epoch': '0.2848', 'num_input_tokens_seen': 23153617, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.365', 'grad_norm': '2.172', 'learning_rate': '4.982e-05', 'epoch': '0.2848', 'num_input_tokens_seen': 23155664, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5409', 'grad_norm': '1.219', 'learning_rate': '4.982e-05', 'epoch': '0.2848', 'num_input_tokens_seen': 23157711, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5549', 'grad_norm': '1.121', 'learning_rate': '4.982e-05', 'epoch': '0.2849', 'num_input_tokens_seen': 23159758, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8454', 'grad_norm': '1.214', 'learning_rate': '4.982e-05', 'epoch': '0.2849', 'num_input_tokens_seen': 23161805, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2897', 'grad_norm': '0.9502', 'learning_rate': '4.982e-05', 'epoch': '0.2849', 'num_input_tokens_seen': 23163852, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6639', 'grad_norm': '1.24', 'learning_rate': '4.982e-05', 'epoch': '0.2849', 'num_input_tokens_seen': 23165899, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.669', 'grad_norm': '2.816', 'learning_rate': '4.982e-05', 'epoch': '0.285', 'num_input_tokens_seen': 23167946, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6037', 'grad_norm': '1.178', 'learning_rate': '4.982e-05', 'epoch': '0.285', 'num_input_tokens_seen': 23169993, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7422', 'grad_norm': '1.345', 'learning_rate': '4.982e-05', 'epoch': '0.285', 'num_input_tokens_seen': 23172040, 'train_runtime': '1.172e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.883', 'grad_norm': '1.77', 'learning_rate': '4.982e-05', 'epoch': '0.285', 'num_input_tokens_seen': 23174087, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5902', 'grad_norm': '1.207', 'learning_rate': '4.982e-05', 'epoch': '0.2851', 'num_input_tokens_seen': 23176134, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.089', 'grad_norm': '2.332', 'learning_rate': '4.982e-05', 'epoch': '0.2851', 'num_input_tokens_seen': 23178181, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.053', 'grad_norm': '1.417', 'learning_rate': '4.982e-05', 'epoch': '0.2851', 'num_input_tokens_seen': 23180228, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8388', 'grad_norm': '1.237', 'learning_rate': '4.982e-05', 'epoch': '0.2851', 'num_input_tokens_seen': 23182275, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8511', 'grad_norm': '1.744', 'learning_rate': '4.982e-05', 'epoch': '0.2852', 'num_input_tokens_seen': 23184322, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.127', 'grad_norm': '1.864', 'learning_rate': '4.982e-05', 'epoch': '0.2852', 'num_input_tokens_seen': 23186369, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.446', 'grad_norm': '2.152', 'learning_rate': '4.982e-05', 'epoch': '0.2852', 'num_input_tokens_seen': 23188416, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8393', 'grad_norm': '1.402', 'learning_rate': '4.982e-05', 'epoch': '0.2852', 'num_input_tokens_seen': 23190463, 'train_runtime': '1.173e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9198', 'grad_norm': '1.2', 'learning_rate': '4.982e-05', 'epoch': '0.2853', 'num_input_tokens_seen': 23192510, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4521', 'grad_norm': '1.048', 'learning_rate': '4.982e-05', 'epoch': '0.2853', 'num_input_tokens_seen': 23194557, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7193', 'grad_norm': '1.333', 'learning_rate': '4.982e-05', 'epoch': '0.2853', 'num_input_tokens_seen': 23196604, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2752', 'grad_norm': '0.9624', 'learning_rate': '4.982e-05', 'epoch': '0.2853', 'num_input_tokens_seen': 23198651, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5971', 'grad_norm': '1.274', 'learning_rate': '4.982e-05', 'epoch': '0.2854', 'num_input_tokens_seen': 23200698, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3603', 'grad_norm': '0.9379', 'learning_rate': '4.982e-05', 'epoch': '0.2854', 'num_input_tokens_seen': 23202745, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3029', 'grad_norm': '1.025', 'learning_rate': '4.982e-05', 'epoch': '0.2854', 'num_input_tokens_seen': 23204792, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3017', 'grad_norm': '0.8278', 'learning_rate': '4.982e-05', 'epoch': '0.2854', 'num_input_tokens_seen': 23206839, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7179', 'grad_norm': '1.081', 'learning_rate': '4.982e-05', 'epoch': '0.2855', 'num_input_tokens_seen': 23208886, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.067', 'grad_norm': '1.112', 'learning_rate': '4.982e-05', 'epoch': '0.2855', 'num_input_tokens_seen': 23210933, 'train_runtime': '1.174e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4414', 'grad_norm': '0.9416', 'learning_rate': '4.982e-05', 'epoch': '0.2855', 'num_input_tokens_seen': 23212980, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5596', 'grad_norm': '1.393', 'learning_rate': '4.982e-05', 'epoch': '0.2855', 'num_input_tokens_seen': 23215027, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.484', 'grad_norm': '2.411', 'learning_rate': '4.982e-05', 'epoch': '0.2856', 'num_input_tokens_seen': 23217074, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2861', 'grad_norm': '0.9968', 'learning_rate': '4.982e-05', 'epoch': '0.2856', 'num_input_tokens_seen': 23219121, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3524', 'grad_norm': '0.8245', 'learning_rate': '4.982e-05', 'epoch': '0.2856', 'num_input_tokens_seen': 23221168, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2119', 'grad_norm': '0.8296', 'learning_rate': '4.982e-05', 'epoch': '0.2856', 'num_input_tokens_seen': 23223215, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3976', 'grad_norm': '0.9653', 'learning_rate': '4.982e-05', 'epoch': '0.2857', 'num_input_tokens_seen': 23225262, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.073', 'grad_norm': '1.455', 'learning_rate': '4.982e-05', 'epoch': '0.2857', 'num_input_tokens_seen': 23227309, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.753', 'grad_norm': '2.395', 'learning_rate': '4.982e-05', 'epoch': '0.2857', 'num_input_tokens_seen': 23229356, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.294', 'grad_norm': '0.9687', 'learning_rate': '4.982e-05', 'epoch': '0.2857', 'num_input_tokens_seen': 23231403, 'train_runtime': '1.175e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1624', 'grad_norm': '0.8632', 'learning_rate': '4.982e-05', 'epoch': '0.2858', 'num_input_tokens_seen': 23233450, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6571', 'grad_norm': '1.549', 'learning_rate': '4.982e-05', 'epoch': '0.2858', 'num_input_tokens_seen': 23235497, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6245', 'grad_norm': '1.512', 'learning_rate': '4.982e-05', 'epoch': '0.2858', 'num_input_tokens_seen': 23237544, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.416', 'grad_norm': '2.141', 'learning_rate': '4.982e-05', 'epoch': '0.2858', 'num_input_tokens_seen': 23239591, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.044', 'grad_norm': '1.222', 'learning_rate': '4.982e-05', 'epoch': '0.2859', 'num_input_tokens_seen': 23241638, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2366', 'grad_norm': '0.7937', 'learning_rate': '4.982e-05', 'epoch': '0.2859', 'num_input_tokens_seen': 23243685, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5998', 'grad_norm': '1.664', 'learning_rate': '4.982e-05', 'epoch': '0.2859', 'num_input_tokens_seen': 23245732, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6917', 'grad_norm': '1.02', 'learning_rate': '4.982e-05', 'epoch': '0.2859', 'num_input_tokens_seen': 23247779, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5089', 'grad_norm': '1.527', 'learning_rate': '4.982e-05', 'epoch': '0.286', 'num_input_tokens_seen': 23249826, 'train_runtime': '1.176e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6726', 'grad_norm': '1.195', 'learning_rate': '4.982e-05', 'epoch': '0.286', 'num_input_tokens_seen': 23251873, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.662', 'grad_norm': '1.494', 'learning_rate': '4.982e-05', 'epoch': '0.286', 'num_input_tokens_seen': 23253920, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3302', 'grad_norm': '0.9411', 'learning_rate': '4.982e-05', 'epoch': '0.286', 'num_input_tokens_seen': 23255967, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5782', 'grad_norm': '0.9721', 'learning_rate': '4.982e-05', 'epoch': '0.2861', 'num_input_tokens_seen': 23258014, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.48', 'grad_norm': '2.565', 'learning_rate': '4.982e-05', 'epoch': '0.2861', 'num_input_tokens_seen': 23260061, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.755', 'grad_norm': '1.345', 'learning_rate': '4.982e-05', 'epoch': '0.2861', 'num_input_tokens_seen': 23262108, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8612', 'grad_norm': '1.632', 'learning_rate': '4.982e-05', 'epoch': '0.2861', 'num_input_tokens_seen': 23264155, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.949', 'grad_norm': '1.954', 'learning_rate': '4.982e-05', 'epoch': '0.2862', 'num_input_tokens_seen': 23266202, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2312', 'grad_norm': '0.8768', 'learning_rate': '4.982e-05', 'epoch': '0.2862', 'num_input_tokens_seen': 23268249, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2963', 'grad_norm': '0.8496', 'learning_rate': '4.982e-05', 'epoch': '0.2862', 'num_input_tokens_seen': 23270296, 'train_runtime': '1.177e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4904', 'grad_norm': '1.188', 'learning_rate': '4.982e-05', 'epoch': '0.2863', 'num_input_tokens_seen': 23272343, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4692', 'grad_norm': '1.418', 'learning_rate': '4.982e-05', 'epoch': '0.2863', 'num_input_tokens_seen': 23274390, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4206', 'grad_norm': '0.8535', 'learning_rate': '4.982e-05', 'epoch': '0.2863', 'num_input_tokens_seen': 23276437, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9844', 'grad_norm': '1.724', 'learning_rate': '4.982e-05', 'epoch': '0.2863', 'num_input_tokens_seen': 23278484, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.217', 'grad_norm': '2.303', 'learning_rate': '4.982e-05', 'epoch': '0.2864', 'num_input_tokens_seen': 23280531, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.713', 'grad_norm': '1.359', 'learning_rate': '4.982e-05', 'epoch': '0.2864', 'num_input_tokens_seen': 23282578, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4312', 'grad_norm': '0.7883', 'learning_rate': '4.982e-05', 'epoch': '0.2864', 'num_input_tokens_seen': 23284625, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5923', 'grad_norm': '1.31', 'learning_rate': '4.982e-05', 'epoch': '0.2864', 'num_input_tokens_seen': 23286672, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5711', 'grad_norm': '1.099', 'learning_rate': '4.982e-05', 'epoch': '0.2865', 'num_input_tokens_seen': 23288719, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6699', 'grad_norm': '1.32', 'learning_rate': '4.982e-05', 'epoch': '0.2865', 'num_input_tokens_seen': 23290766, 'train_runtime': '1.178e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2653', 'grad_norm': '0.7906', 'learning_rate': '4.982e-05', 'epoch': '0.2865', 'num_input_tokens_seen': 23292813, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3538', 'grad_norm': '0.7915', 'learning_rate': '4.982e-05', 'epoch': '0.2865', 'num_input_tokens_seen': 23294860, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4565', 'grad_norm': '0.9364', 'learning_rate': '4.982e-05', 'epoch': '0.2866', 'num_input_tokens_seen': 23296907, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.438', 'grad_norm': '1.382', 'learning_rate': '4.982e-05', 'epoch': '0.2866', 'num_input_tokens_seen': 23298954, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2903', 'grad_norm': '0.8654', 'learning_rate': '4.982e-05', 'epoch': '0.2866', 'num_input_tokens_seen': 23301001, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.081', 'grad_norm': '1.945', 'learning_rate': '4.982e-05', 'epoch': '0.2866', 'num_input_tokens_seen': 23303048, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4185', 'grad_norm': '1.106', 'learning_rate': '4.982e-05', 'epoch': '0.2867', 'num_input_tokens_seen': 23305095, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3168', 'grad_norm': '0.7816', 'learning_rate': '4.982e-05', 'epoch': '0.2867', 'num_input_tokens_seen': 23307142, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9642', 'grad_norm': '1.54', 'learning_rate': '4.982e-05', 'epoch': '0.2867', 'num_input_tokens_seen': 23309189, 'train_runtime': '1.179e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7028', 'grad_norm': '1.193', 'learning_rate': '4.982e-05', 'epoch': '0.2867', 'num_input_tokens_seen': 23311236, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.122', 'grad_norm': '2.049', 'learning_rate': '4.982e-05', 'epoch': '0.2868', 'num_input_tokens_seen': 23313283, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6277', 'grad_norm': '1.344', 'learning_rate': '4.982e-05', 'epoch': '0.2868', 'num_input_tokens_seen': 23315330, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.494', 'grad_norm': '2.563', 'learning_rate': '4.982e-05', 'epoch': '0.2868', 'num_input_tokens_seen': 23317377, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.726', 'grad_norm': '1.169', 'learning_rate': '4.982e-05', 'epoch': '0.2868', 'num_input_tokens_seen': 23319424, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9397', 'grad_norm': '1.92', 'learning_rate': '4.982e-05', 'epoch': '0.2869', 'num_input_tokens_seen': 23321471, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.049', 'grad_norm': '1.677', 'learning_rate': '4.982e-05', 'epoch': '0.2869', 'num_input_tokens_seen': 23323518, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3302', 'grad_norm': '0.8724', 'learning_rate': '4.982e-05', 'epoch': '0.2869', 'num_input_tokens_seen': 23325565, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3032', 'grad_norm': '0.8045', 'learning_rate': '4.982e-05', 'epoch': '0.2869', 'num_input_tokens_seen': 23327612, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9513', 'grad_norm': '1.847', 'learning_rate': '4.982e-05', 'epoch': '0.287', 'num_input_tokens_seen': 23329659, 'train_runtime': '1.18e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6926', 'grad_norm': '1.36', 'learning_rate': '4.982e-05', 'epoch': '0.287', 'num_input_tokens_seen': 23331706, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4569', 'grad_norm': '1.042', 'learning_rate': '4.982e-05', 'epoch': '0.287', 'num_input_tokens_seen': 23333753, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4007', 'grad_norm': '0.9743', 'learning_rate': '4.982e-05', 'epoch': '0.287', 'num_input_tokens_seen': 23335800, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6557', 'grad_norm': '1.411', 'learning_rate': '4.982e-05', 'epoch': '0.2871', 'num_input_tokens_seen': 23337847, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.384', 'grad_norm': '0.8965', 'learning_rate': '4.982e-05', 'epoch': '0.2871', 'num_input_tokens_seen': 23339894, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2696', 'grad_norm': '0.7946', 'learning_rate': '4.982e-05', 'epoch': '0.2871', 'num_input_tokens_seen': 23341941, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4751', 'grad_norm': '1.088', 'learning_rate': '4.982e-05', 'epoch': '0.2871', 'num_input_tokens_seen': 23343988, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '1.629', 'learning_rate': '4.982e-05', 'epoch': '0.2872', 'num_input_tokens_seen': 23346035, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.643', 'grad_norm': '2.355', 'learning_rate': '4.982e-05', 'epoch': '0.2872', 'num_input_tokens_seen': 23348082, 'train_runtime': '1.181e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6913', 'grad_norm': '0.8874', 'learning_rate': '4.982e-05', 'epoch': '0.2872', 'num_input_tokens_seen': 23350129, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7297', 'grad_norm': '1.296', 'learning_rate': '4.982e-05', 'epoch': '0.2872', 'num_input_tokens_seen': 23352176, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4643', 'grad_norm': '1.303', 'learning_rate': '4.982e-05', 'epoch': '0.2873', 'num_input_tokens_seen': 23354223, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.025', 'grad_norm': '2.933', 'learning_rate': '4.982e-05', 'epoch': '0.2873', 'num_input_tokens_seen': 23356270, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.183', 'grad_norm': '1.635', 'learning_rate': '4.982e-05', 'epoch': '0.2873', 'num_input_tokens_seen': 23358317, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.039', 'grad_norm': '2.691', 'learning_rate': '4.982e-05', 'epoch': '0.2873', 'num_input_tokens_seen': 23360364, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7081', 'grad_norm': '1.15', 'learning_rate': '4.982e-05', 'epoch': '0.2874', 'num_input_tokens_seen': 23362411, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5104', 'grad_norm': '1.237', 'learning_rate': '4.982e-05', 'epoch': '0.2874', 'num_input_tokens_seen': 23364458, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5398', 'grad_norm': '1.078', 'learning_rate': '4.982e-05', 'epoch': '0.2874', 'num_input_tokens_seen': 23366505, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.078', 'grad_norm': '2.745', 'learning_rate': '4.982e-05', 'epoch': '0.2874', 'num_input_tokens_seen': 23368552, 'train_runtime': '1.182e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.033', 'grad_norm': '1.655', 'learning_rate': '4.982e-05', 'epoch': '0.2875', 'num_input_tokens_seen': 23370599, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3451', 'grad_norm': '0.8659', 'learning_rate': '4.982e-05', 'epoch': '0.2875', 'num_input_tokens_seen': 23372646, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8098', 'grad_norm': '1.366', 'learning_rate': '4.982e-05', 'epoch': '0.2875', 'num_input_tokens_seen': 23374693, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3924', 'grad_norm': '0.8481', 'learning_rate': '4.982e-05', 'epoch': '0.2875', 'num_input_tokens_seen': 23376740, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.374', 'grad_norm': '0.974', 'learning_rate': '4.982e-05', 'epoch': '0.2876', 'num_input_tokens_seen': 23378787, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8832', 'grad_norm': '1.408', 'learning_rate': '4.982e-05', 'epoch': '0.2876', 'num_input_tokens_seen': 23380834, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.703', 'grad_norm': '1.367', 'learning_rate': '4.982e-05', 'epoch': '0.2876', 'num_input_tokens_seen': 23382881, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.952', 'grad_norm': '2.852', 'learning_rate': '4.982e-05', 'epoch': '0.2876', 'num_input_tokens_seen': 23384928, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6446', 'grad_norm': '1.123', 'learning_rate': '4.982e-05', 'epoch': '0.2877', 'num_input_tokens_seen': 23386975, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3472', 'grad_norm': '0.8048', 'learning_rate': '4.982e-05', 'epoch': '0.2877', 'num_input_tokens_seen': 23389022, 'train_runtime': '1.183e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.696', 'grad_norm': '1.391', 'learning_rate': '4.982e-05', 'epoch': '0.2877', 'num_input_tokens_seen': 23391069, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.095', 'grad_norm': '3.225', 'learning_rate': '4.982e-05', 'epoch': '0.2877', 'num_input_tokens_seen': 23393116, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3509', 'grad_norm': '0.8422', 'learning_rate': '4.982e-05', 'epoch': '0.2878', 'num_input_tokens_seen': 23395163, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3991', 'grad_norm': '1.309', 'learning_rate': '4.982e-05', 'epoch': '0.2878', 'num_input_tokens_seen': 23397210, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7415', 'grad_norm': '1.228', 'learning_rate': '4.982e-05', 'epoch': '0.2878', 'num_input_tokens_seen': 23399257, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5994', 'grad_norm': '0.9746', 'learning_rate': '4.982e-05', 'epoch': '0.2878', 'num_input_tokens_seen': 23401304, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6727', 'grad_norm': '1.103', 'learning_rate': '4.982e-05', 'epoch': '0.2879', 'num_input_tokens_seen': 23403351, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4311', 'grad_norm': '0.942', 'learning_rate': '4.982e-05', 'epoch': '0.2879', 'num_input_tokens_seen': 23405398, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3331', 'grad_norm': '1.126', 'learning_rate': '4.982e-05', 'epoch': '0.2879', 'num_input_tokens_seen': 23407445, 'train_runtime': '1.184e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.406', 'grad_norm': '0.8874', 'learning_rate': '4.982e-05', 'epoch': '0.2879', 'num_input_tokens_seen': 23409492, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3397', 'grad_norm': '0.9199', 'learning_rate': '4.982e-05', 'epoch': '0.288', 'num_input_tokens_seen': 23411539, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.814', 'grad_norm': '2.844', 'learning_rate': '4.982e-05', 'epoch': '0.288', 'num_input_tokens_seen': 23413586, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.045', 'grad_norm': '1.803', 'learning_rate': '4.982e-05', 'epoch': '0.288', 'num_input_tokens_seen': 23415633, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6712', 'grad_norm': '1.273', 'learning_rate': '4.982e-05', 'epoch': '0.288', 'num_input_tokens_seen': 23417680, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7211', 'grad_norm': '1.615', 'learning_rate': '4.982e-05', 'epoch': '0.2881', 'num_input_tokens_seen': 23419727, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2693', 'grad_norm': '0.9525', 'learning_rate': '4.982e-05', 'epoch': '0.2881', 'num_input_tokens_seen': 23421774, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.418', 'grad_norm': '1.136', 'learning_rate': '4.982e-05', 'epoch': '0.2881', 'num_input_tokens_seen': 23423821, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.766', 'grad_norm': '1.232', 'learning_rate': '4.982e-05', 'epoch': '0.2881', 'num_input_tokens_seen': 23425868, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3252', 'grad_norm': '0.7408', 'learning_rate': '4.982e-05', 'epoch': '0.2882', 'num_input_tokens_seen': 23427915, 'train_runtime': '1.185e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.085', 'grad_norm': '1.838', 'learning_rate': '4.982e-05', 'epoch': '0.2882', 'num_input_tokens_seen': 23429962, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.394', 'grad_norm': '2.317', 'learning_rate': '4.982e-05', 'epoch': '0.2882', 'num_input_tokens_seen': 23432009, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4392', 'grad_norm': '0.9476', 'learning_rate': '4.982e-05', 'epoch': '0.2882', 'num_input_tokens_seen': 23434056, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.034', 'grad_norm': '1.825', 'learning_rate': '4.982e-05', 'epoch': '0.2883', 'num_input_tokens_seen': 23436103, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9972', 'grad_norm': '1.057', 'learning_rate': '4.982e-05', 'epoch': '0.2883', 'num_input_tokens_seen': 23438150, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6411', 'grad_norm': '0.8778', 'learning_rate': '4.982e-05', 'epoch': '0.2883', 'num_input_tokens_seen': 23440197, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.441', 'grad_norm': '2.078', 'learning_rate': '4.982e-05', 'epoch': '0.2883', 'num_input_tokens_seen': 23442244, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8234', 'grad_norm': '1.836', 'learning_rate': '4.982e-05', 'epoch': '0.2884', 'num_input_tokens_seen': 23444291, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2472', 'grad_norm': '0.874', 'learning_rate': '4.982e-05', 'epoch': '0.2884', 'num_input_tokens_seen': 23446338, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.247', 'grad_norm': '2.31', 'learning_rate': '4.982e-05', 'epoch': '0.2884', 'num_input_tokens_seen': 23448385, 'train_runtime': '1.186e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4918', 'grad_norm': '1.098', 'learning_rate': '4.982e-05', 'epoch': '0.2884', 'num_input_tokens_seen': 23450432, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6829', 'grad_norm': '1.418', 'learning_rate': '4.982e-05', 'epoch': '0.2885', 'num_input_tokens_seen': 23452479, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4623', 'grad_norm': '1.429', 'learning_rate': '4.982e-05', 'epoch': '0.2885', 'num_input_tokens_seen': 23454526, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5745', 'grad_norm': '0.923', 'learning_rate': '4.982e-05', 'epoch': '0.2885', 'num_input_tokens_seen': 23456573, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.657', 'grad_norm': '2.98', 'learning_rate': '4.982e-05', 'epoch': '0.2885', 'num_input_tokens_seen': 23458620, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.055', 'grad_norm': '1.478', 'learning_rate': '4.982e-05', 'epoch': '0.2886', 'num_input_tokens_seen': 23460667, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6427', 'grad_norm': '1.234', 'learning_rate': '4.982e-05', 'epoch': '0.2886', 'num_input_tokens_seen': 23462714, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5621', 'grad_norm': '1.448', 'learning_rate': '4.982e-05', 'epoch': '0.2886', 'num_input_tokens_seen': 23464761, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7208', 'grad_norm': '1.337', 'learning_rate': '4.982e-05', 'epoch': '0.2886', 'num_input_tokens_seen': 23466808, 'train_runtime': '1.187e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4256', 'grad_norm': '1.246', 'learning_rate': '4.982e-05', 'epoch': '0.2887', 'num_input_tokens_seen': 23468855, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.4', 'grad_norm': '2.391', 'learning_rate': '4.982e-05', 'epoch': '0.2887', 'num_input_tokens_seen': 23470902, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.239', 'grad_norm': '2.147', 'learning_rate': '4.982e-05', 'epoch': '0.2887', 'num_input_tokens_seen': 23472949, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7458', 'grad_norm': '1.639', 'learning_rate': '4.982e-05', 'epoch': '0.2887', 'num_input_tokens_seen': 23474996, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7087', 'grad_norm': '1.481', 'learning_rate': '4.982e-05', 'epoch': '0.2888', 'num_input_tokens_seen': 23477043, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3332', 'grad_norm': '0.732', 'learning_rate': '4.982e-05', 'epoch': '0.2888', 'num_input_tokens_seen': 23479090, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3954', 'grad_norm': '0.9578', 'learning_rate': '4.982e-05', 'epoch': '0.2888', 'num_input_tokens_seen': 23481137, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4473', 'grad_norm': '1.003', 'learning_rate': '4.982e-05', 'epoch': '0.2888', 'num_input_tokens_seen': 23483184, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4283', 'grad_norm': '1.32', 'learning_rate': '4.982e-05', 'epoch': '0.2889', 'num_input_tokens_seen': 23485231, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.555', 'grad_norm': '2.068', 'learning_rate': '4.982e-05', 'epoch': '0.2889', 'num_input_tokens_seen': 23487278, 'train_runtime': '1.188e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6855', 'grad_norm': '0.945', 'learning_rate': '4.982e-05', 'epoch': '0.2889', 'num_input_tokens_seen': 23489325, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7067', 'grad_norm': '1.737', 'learning_rate': '4.982e-05', 'epoch': '0.2889', 'num_input_tokens_seen': 23491372, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4503', 'grad_norm': '1.061', 'learning_rate': '4.982e-05', 'epoch': '0.289', 'num_input_tokens_seen': 23493419, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5224', 'grad_norm': '1.053', 'learning_rate': '4.982e-05', 'epoch': '0.289', 'num_input_tokens_seen': 23495466, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5824', 'grad_norm': '1.165', 'learning_rate': '4.982e-05', 'epoch': '0.289', 'num_input_tokens_seen': 23497513, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.51', 'grad_norm': '3.183', 'learning_rate': '4.982e-05', 'epoch': '0.289', 'num_input_tokens_seen': 23499560, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2859', 'grad_norm': '1.053', 'learning_rate': '4.982e-05', 'epoch': '0.2891', 'num_input_tokens_seen': 23501607, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6654', 'grad_norm': '1.036', 'learning_rate': '4.982e-05', 'epoch': '0.2891', 'num_input_tokens_seen': 23503654, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.047', 'grad_norm': '2.124', 'learning_rate': '4.982e-05', 'epoch': '0.2891', 'num_input_tokens_seen': 23505701, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8185', 'grad_norm': '1.483', 'learning_rate': '4.982e-05', 'epoch': '0.2891', 'num_input_tokens_seen': 23507748, 'train_runtime': '1.189e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2305', 'grad_norm': '0.7838', 'learning_rate': '4.982e-05', 'epoch': '0.2892', 'num_input_tokens_seen': 23509795, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8174', 'grad_norm': '1.529', 'learning_rate': '4.982e-05', 'epoch': '0.2892', 'num_input_tokens_seen': 23511842, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3254', 'grad_norm': '0.7722', 'learning_rate': '4.982e-05', 'epoch': '0.2892', 'num_input_tokens_seen': 23513889, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5247', 'grad_norm': '0.7989', 'learning_rate': '4.982e-05', 'epoch': '0.2892', 'num_input_tokens_seen': 23515936, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.252', 'grad_norm': '1.747', 'learning_rate': '4.982e-05', 'epoch': '0.2893', 'num_input_tokens_seen': 23517983, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2422', 'grad_norm': '0.8071', 'learning_rate': '4.982e-05', 'epoch': '0.2893', 'num_input_tokens_seen': 23520030, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7119', 'grad_norm': '1.428', 'learning_rate': '4.982e-05', 'epoch': '0.2893', 'num_input_tokens_seen': 23522077, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2637', 'grad_norm': '0.811', 'learning_rate': '4.982e-05', 'epoch': '0.2893', 'num_input_tokens_seen': 23524124, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5885', 'grad_norm': '1.218', 'learning_rate': '4.982e-05', 'epoch': '0.2894', 'num_input_tokens_seen': 23526171, 'train_runtime': '1.19e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6126', 'grad_norm': '0.9052', 'learning_rate': '4.982e-05', 'epoch': '0.2894', 'num_input_tokens_seen': 23528218, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4892', 'grad_norm': '1.033', 'learning_rate': '4.982e-05', 'epoch': '0.2894', 'num_input_tokens_seen': 23530265, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8433', 'grad_norm': '1.588', 'learning_rate': '4.982e-05', 'epoch': '0.2894', 'num_input_tokens_seen': 23532312, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6069', 'grad_norm': '1.227', 'learning_rate': '4.982e-05', 'epoch': '0.2895', 'num_input_tokens_seen': 23534359, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.153', 'grad_norm': '1.889', 'learning_rate': '4.982e-05', 'epoch': '0.2895', 'num_input_tokens_seen': 23536406, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5595', 'grad_norm': '1.234', 'learning_rate': '4.982e-05', 'epoch': '0.2895', 'num_input_tokens_seen': 23538453, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9832', 'grad_norm': '2.325', 'learning_rate': '4.982e-05', 'epoch': '0.2895', 'num_input_tokens_seen': 23540500, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5903', 'grad_norm': '1.382', 'learning_rate': '4.982e-05', 'epoch': '0.2896', 'num_input_tokens_seen': 23542547, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7899', 'grad_norm': '1.703', 'learning_rate': '4.982e-05', 'epoch': '0.2896', 'num_input_tokens_seen': 23544594, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.332', 'grad_norm': '2.44', 'learning_rate': '4.982e-05', 'epoch': '0.2896', 'num_input_tokens_seen': 23546641, 'train_runtime': '1.191e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6296', 'grad_norm': '1.467', 'learning_rate': '4.982e-05', 'epoch': '0.2896', 'num_input_tokens_seen': 23548688, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4088', 'grad_norm': '0.9945', 'learning_rate': '4.982e-05', 'epoch': '0.2897', 'num_input_tokens_seen': 23550735, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9712', 'grad_norm': '2.043', 'learning_rate': '4.982e-05', 'epoch': '0.2897', 'num_input_tokens_seen': 23552782, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.032', 'grad_norm': '1.453', 'learning_rate': '4.982e-05', 'epoch': '0.2897', 'num_input_tokens_seen': 23554829, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.093', 'grad_norm': '1.559', 'learning_rate': '4.982e-05', 'epoch': '0.2897', 'num_input_tokens_seen': 23556876, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6063', 'grad_norm': '1.314', 'learning_rate': '4.982e-05', 'epoch': '0.2898', 'num_input_tokens_seen': 23558923, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.376', 'grad_norm': '1.981', 'learning_rate': '4.982e-05', 'epoch': '0.2898', 'num_input_tokens_seen': 23560970, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7771', 'grad_norm': '1.295', 'learning_rate': '4.982e-05', 'epoch': '0.2898', 'num_input_tokens_seen': 23563017, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8331', 'grad_norm': '1.865', 'learning_rate': '4.982e-05', 'epoch': '0.2899', 'num_input_tokens_seen': 23565064, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7399', 'grad_norm': '1.257', 'learning_rate': '4.982e-05', 'epoch': '0.2899', 'num_input_tokens_seen': 23567111, 'train_runtime': '1.192e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5455', 'grad_norm': '1.128', 'learning_rate': '4.981e-05', 'epoch': '0.2899', 'num_input_tokens_seen': 23569158, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.076', 'grad_norm': '2.031', 'learning_rate': '4.981e-05', 'epoch': '0.2899', 'num_input_tokens_seen': 23571205, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2663', 'grad_norm': '0.8338', 'learning_rate': '4.981e-05', 'epoch': '0.29', 'num_input_tokens_seen': 23573252, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5134', 'grad_norm': '1.081', 'learning_rate': '4.981e-05', 'epoch': '0.29', 'num_input_tokens_seen': 23575299, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.758', 'grad_norm': '2.704', 'learning_rate': '4.981e-05', 'epoch': '0.29', 'num_input_tokens_seen': 23577346, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.39', 'grad_norm': '2.507', 'learning_rate': '4.981e-05', 'epoch': '0.29', 'num_input_tokens_seen': 23579393, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.398', 'grad_norm': '2.488', 'learning_rate': '4.981e-05', 'epoch': '0.2901', 'num_input_tokens_seen': 23581440, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6256', 'grad_norm': '1.239', 'learning_rate': '4.981e-05', 'epoch': '0.2901', 'num_input_tokens_seen': 23583487, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7177', 'grad_norm': '1.549', 'learning_rate': '4.981e-05', 'epoch': '0.2901', 'num_input_tokens_seen': 23585534, 'train_runtime': '1.193e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4374', 'grad_norm': '0.9935', 'learning_rate': '4.981e-05', 'epoch': '0.2901', 'num_input_tokens_seen': 23587581, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7399', 'grad_norm': '1.568', 'learning_rate': '4.981e-05', 'epoch': '0.2902', 'num_input_tokens_seen': 23589628, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6645', 'grad_norm': '1.031', 'learning_rate': '4.981e-05', 'epoch': '0.2902', 'num_input_tokens_seen': 23591675, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2048', 'grad_norm': '0.7603', 'learning_rate': '4.981e-05', 'epoch': '0.2902', 'num_input_tokens_seen': 23593722, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7759', 'grad_norm': '1.37', 'learning_rate': '4.981e-05', 'epoch': '0.2902', 'num_input_tokens_seen': 23595769, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2898', 'grad_norm': '0.917', 'learning_rate': '4.981e-05', 'epoch': '0.2903', 'num_input_tokens_seen': 23597816, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4611', 'grad_norm': '1.098', 'learning_rate': '4.981e-05', 'epoch': '0.2903', 'num_input_tokens_seen': 23599863, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.032', 'grad_norm': '1.824', 'learning_rate': '4.981e-05', 'epoch': '0.2903', 'num_input_tokens_seen': 23601910, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5339', 'grad_norm': '1.097', 'learning_rate': '4.981e-05', 'epoch': '0.2903', 'num_input_tokens_seen': 23603957, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2007', 'grad_norm': '0.8781', 'learning_rate': '4.981e-05', 'epoch': '0.2904', 'num_input_tokens_seen': 23606004, 'train_runtime': '1.194e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.527', 'grad_norm': '2.759', 'learning_rate': '4.981e-05', 'epoch': '0.2904', 'num_input_tokens_seen': 23608051, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.043', 'grad_norm': '1.473', 'learning_rate': '4.981e-05', 'epoch': '0.2904', 'num_input_tokens_seen': 23610098, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7062', 'grad_norm': '1.584', 'learning_rate': '4.981e-05', 'epoch': '0.2904', 'num_input_tokens_seen': 23612145, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.041', 'grad_norm': '1.827', 'learning_rate': '4.981e-05', 'epoch': '0.2905', 'num_input_tokens_seen': 23614192, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1879', 'grad_norm': '0.6991', 'learning_rate': '4.981e-05', 'epoch': '0.2905', 'num_input_tokens_seen': 23616239, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5997', 'grad_norm': '1.486', 'learning_rate': '4.981e-05', 'epoch': '0.2905', 'num_input_tokens_seen': 23618286, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7645', 'grad_norm': '1.104', 'learning_rate': '4.981e-05', 'epoch': '0.2905', 'num_input_tokens_seen': 23620333, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5174', 'grad_norm': '1.446', 'learning_rate': '4.981e-05', 'epoch': '0.2906', 'num_input_tokens_seen': 23622380, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2373', 'grad_norm': '0.9124', 'learning_rate': '4.981e-05', 'epoch': '0.2906', 'num_input_tokens_seen': 23624427, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5087', 'grad_norm': '1.307', 'learning_rate': '4.981e-05', 'epoch': '0.2906', 'num_input_tokens_seen': 23626474, 'train_runtime': '1.195e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.051', 'grad_norm': '2.107', 'learning_rate': '4.981e-05', 'epoch': '0.2906', 'num_input_tokens_seen': 23628521, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6488', 'grad_norm': '1.501', 'learning_rate': '4.981e-05', 'epoch': '0.2907', 'num_input_tokens_seen': 23630568, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8337', 'grad_norm': '1.281', 'learning_rate': '4.981e-05', 'epoch': '0.2907', 'num_input_tokens_seen': 23632615, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '2.073', 'learning_rate': '4.981e-05', 'epoch': '0.2907', 'num_input_tokens_seen': 23634662, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3599', 'grad_norm': '0.7646', 'learning_rate': '4.981e-05', 'epoch': '0.2907', 'num_input_tokens_seen': 23636709, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.164', 'grad_norm': '1.894', 'learning_rate': '4.981e-05', 'epoch': '0.2908', 'num_input_tokens_seen': 23638756, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2257', 'grad_norm': '0.8831', 'learning_rate': '4.981e-05', 'epoch': '0.2908', 'num_input_tokens_seen': 23640803, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2507', 'grad_norm': '0.9207', 'learning_rate': '4.981e-05', 'epoch': '0.2908', 'num_input_tokens_seen': 23642850, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9597', 'grad_norm': '1.724', 'learning_rate': '4.981e-05', 'epoch': '0.2908', 'num_input_tokens_seen': 23644897, 'train_runtime': '1.196e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4185', 'grad_norm': '0.9978', 'learning_rate': '4.981e-05', 'epoch': '0.2909', 'num_input_tokens_seen': 23646944, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5913', 'grad_norm': '1.357', 'learning_rate': '4.981e-05', 'epoch': '0.2909', 'num_input_tokens_seen': 23648991, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9244', 'grad_norm': '1.629', 'learning_rate': '4.981e-05', 'epoch': '0.2909', 'num_input_tokens_seen': 23651038, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2255', 'grad_norm': '0.8516', 'learning_rate': '4.981e-05', 'epoch': '0.2909', 'num_input_tokens_seen': 23653085, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2136', 'grad_norm': '0.7605', 'learning_rate': '4.981e-05', 'epoch': '0.291', 'num_input_tokens_seen': 23655132, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2433', 'grad_norm': '0.8448', 'learning_rate': '4.981e-05', 'epoch': '0.291', 'num_input_tokens_seen': 23657179, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.457', 'grad_norm': '2.426', 'learning_rate': '4.981e-05', 'epoch': '0.291', 'num_input_tokens_seen': 23659226, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8621', 'grad_norm': '1.429', 'learning_rate': '4.981e-05', 'epoch': '0.291', 'num_input_tokens_seen': 23661273, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.479', 'grad_norm': '2.363', 'learning_rate': '4.981e-05', 'epoch': '0.2911', 'num_input_tokens_seen': 23663320, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3754', 'grad_norm': '0.9871', 'learning_rate': '4.981e-05', 'epoch': '0.2911', 'num_input_tokens_seen': 23665367, 'train_runtime': '1.197e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6251', 'grad_norm': '1.406', 'learning_rate': '4.981e-05', 'epoch': '0.2911', 'num_input_tokens_seen': 23667414, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.132', 'grad_norm': '2.248', 'learning_rate': '4.981e-05', 'epoch': '0.2911', 'num_input_tokens_seen': 23669461, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4561', 'grad_norm': '1.304', 'learning_rate': '4.981e-05', 'epoch': '0.2912', 'num_input_tokens_seen': 23671508, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6082', 'grad_norm': '1.382', 'learning_rate': '4.981e-05', 'epoch': '0.2912', 'num_input_tokens_seen': 23673555, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6055', 'grad_norm': '1.075', 'learning_rate': '4.981e-05', 'epoch': '0.2912', 'num_input_tokens_seen': 23675602, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9421', 'grad_norm': '1.238', 'learning_rate': '4.981e-05', 'epoch': '0.2912', 'num_input_tokens_seen': 23677649, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7671', 'grad_norm': '1.403', 'learning_rate': '4.981e-05', 'epoch': '0.2913', 'num_input_tokens_seen': 23679696, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5396', 'grad_norm': '1.203', 'learning_rate': '4.981e-05', 'epoch': '0.2913', 'num_input_tokens_seen': 23681743, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7222', 'grad_norm': '1.238', 'learning_rate': '4.981e-05', 'epoch': '0.2913', 'num_input_tokens_seen': 23683790, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.337', 'grad_norm': '2.286', 'learning_rate': '4.981e-05', 'epoch': '0.2913', 'num_input_tokens_seen': 23685837, 'train_runtime': '1.198e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3998', 'grad_norm': '0.9953', 'learning_rate': '4.981e-05', 'epoch': '0.2914', 'num_input_tokens_seen': 23687884, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9767', 'grad_norm': '1.918', 'learning_rate': '4.981e-05', 'epoch': '0.2914', 'num_input_tokens_seen': 23689931, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6926', 'grad_norm': '1.357', 'learning_rate': '4.981e-05', 'epoch': '0.2914', 'num_input_tokens_seen': 23691978, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.296', 'grad_norm': '0.8238', 'learning_rate': '4.981e-05', 'epoch': '0.2914', 'num_input_tokens_seen': 23694025, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.841', 'grad_norm': '1.273', 'learning_rate': '4.981e-05', 'epoch': '0.2915', 'num_input_tokens_seen': 23696072, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8401', 'grad_norm': '1.372', 'learning_rate': '4.981e-05', 'epoch': '0.2915', 'num_input_tokens_seen': 23698119, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8458', 'grad_norm': '1.84', 'learning_rate': '4.981e-05', 'epoch': '0.2915', 'num_input_tokens_seen': 23700166, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3484', 'grad_norm': '0.7882', 'learning_rate': '4.981e-05', 'epoch': '0.2915', 'num_input_tokens_seen': 23702213, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2197', 'grad_norm': '0.9041', 'learning_rate': '4.981e-05', 'epoch': '0.2916', 'num_input_tokens_seen': 23704260, 'train_runtime': '1.199e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '1.646', 'learning_rate': '4.981e-05', 'epoch': '0.2916', 'num_input_tokens_seen': 23706307, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8738', 'grad_norm': '1.357', 'learning_rate': '4.981e-05', 'epoch': '0.2916', 'num_input_tokens_seen': 23708354, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.242', 'grad_norm': '2.358', 'learning_rate': '4.981e-05', 'epoch': '0.2916', 'num_input_tokens_seen': 23710401, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9704', 'grad_norm': '1.942', 'learning_rate': '4.981e-05', 'epoch': '0.2917', 'num_input_tokens_seen': 23712448, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.062', 'grad_norm': '1.797', 'learning_rate': '4.981e-05', 'epoch': '0.2917', 'num_input_tokens_seen': 23714495, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6926', 'grad_norm': '0.9186', 'learning_rate': '4.981e-05', 'epoch': '0.2917', 'num_input_tokens_seen': 23716542, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4354', 'grad_norm': '0.9828', 'learning_rate': '4.981e-05', 'epoch': '0.2917', 'num_input_tokens_seen': 23718589, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2356', 'grad_norm': '0.8555', 'learning_rate': '4.981e-05', 'epoch': '0.2918', 'num_input_tokens_seen': 23720636, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3435', 'grad_norm': '0.9323', 'learning_rate': '4.981e-05', 'epoch': '0.2918', 'num_input_tokens_seen': 23722683, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5321', 'grad_norm': '1.25', 'learning_rate': '4.981e-05', 'epoch': '0.2918', 'num_input_tokens_seen': 23724730, 'train_runtime': '1.2e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2718', 'grad_norm': '0.7401', 'learning_rate': '4.981e-05', 'epoch': '0.2918', 'num_input_tokens_seen': 23726777, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4744', 'grad_norm': '1.277', 'learning_rate': '4.981e-05', 'epoch': '0.2919', 'num_input_tokens_seen': 23728824, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8549', 'grad_norm': '1.896', 'learning_rate': '4.981e-05', 'epoch': '0.2919', 'num_input_tokens_seen': 23730871, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9501', 'grad_norm': '1.716', 'learning_rate': '4.981e-05', 'epoch': '0.2919', 'num_input_tokens_seen': 23732918, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.654', 'grad_norm': '1.104', 'learning_rate': '4.981e-05', 'epoch': '0.2919', 'num_input_tokens_seen': 23734965, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3733', 'grad_norm': '0.8437', 'learning_rate': '4.981e-05', 'epoch': '0.292', 'num_input_tokens_seen': 23737012, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3128', 'grad_norm': '0.8467', 'learning_rate': '4.981e-05', 'epoch': '0.292', 'num_input_tokens_seen': 23739059, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8108', 'grad_norm': '1.467', 'learning_rate': '4.981e-05', 'epoch': '0.292', 'num_input_tokens_seen': 23741106, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7588', 'grad_norm': '1.409', 'learning_rate': '4.981e-05', 'epoch': '0.292', 'num_input_tokens_seen': 23743153, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3391', 'grad_norm': '0.9169', 'learning_rate': '4.981e-05', 'epoch': '0.2921', 'num_input_tokens_seen': 23745200, 'train_runtime': '1.201e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6543', 'grad_norm': '1.278', 'learning_rate': '4.981e-05', 'epoch': '0.2921', 'num_input_tokens_seen': 23747247, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5182', 'grad_norm': '1.288', 'learning_rate': '4.981e-05', 'epoch': '0.2921', 'num_input_tokens_seen': 23749294, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8239', 'grad_norm': '1.35', 'learning_rate': '4.981e-05', 'epoch': '0.2921', 'num_input_tokens_seen': 23751341, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8441', 'grad_norm': '1.458', 'learning_rate': '4.981e-05', 'epoch': '0.2922', 'num_input_tokens_seen': 23753388, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.308', 'grad_norm': '0.9719', 'learning_rate': '4.981e-05', 'epoch': '0.2922', 'num_input_tokens_seen': 23755435, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8468', 'grad_norm': '1.211', 'learning_rate': '4.981e-05', 'epoch': '0.2922', 'num_input_tokens_seen': 23757482, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '1.807', 'learning_rate': '4.981e-05', 'epoch': '0.2922', 'num_input_tokens_seen': 23759529, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7872', 'grad_norm': '1.042', 'learning_rate': '4.981e-05', 'epoch': '0.2923', 'num_input_tokens_seen': 23761576, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3711', 'grad_norm': '0.9945', 'learning_rate': '4.981e-05', 'epoch': '0.2923', 'num_input_tokens_seen': 23763623, 'train_runtime': '1.202e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2322', 'grad_norm': '0.7761', 'learning_rate': '4.981e-05', 'epoch': '0.2923', 'num_input_tokens_seen': 23765670, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4284', 'grad_norm': '0.9539', 'learning_rate': '4.981e-05', 'epoch': '0.2923', 'num_input_tokens_seen': 23767717, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6374', 'grad_norm': '1.275', 'learning_rate': '4.981e-05', 'epoch': '0.2924', 'num_input_tokens_seen': 23769764, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3449', 'grad_norm': '1.022', 'learning_rate': '4.981e-05', 'epoch': '0.2924', 'num_input_tokens_seen': 23771811, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7476', 'grad_norm': '1.048', 'learning_rate': '4.981e-05', 'epoch': '0.2924', 'num_input_tokens_seen': 23773858, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.241', 'grad_norm': '2.092', 'learning_rate': '4.981e-05', 'epoch': '0.2924', 'num_input_tokens_seen': 23775905, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4969', 'grad_norm': '1.069', 'learning_rate': '4.981e-05', 'epoch': '0.2925', 'num_input_tokens_seen': 23777952, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '1.882', 'learning_rate': '4.981e-05', 'epoch': '0.2925', 'num_input_tokens_seen': 23779999, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6415', 'grad_norm': '1.284', 'learning_rate': '4.981e-05', 'epoch': '0.2925', 'num_input_tokens_seen': 23782046, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.416', 'grad_norm': '2.298', 'learning_rate': '4.981e-05', 'epoch': '0.2925', 'num_input_tokens_seen': 23784093, 'train_runtime': '1.203e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3939', 'grad_norm': '0.9534', 'learning_rate': '4.981e-05', 'epoch': '0.2926', 'num_input_tokens_seen': 23786140, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3573', 'grad_norm': '1.015', 'learning_rate': '4.981e-05', 'epoch': '0.2926', 'num_input_tokens_seen': 23788187, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3304', 'grad_norm': '0.8601', 'learning_rate': '4.981e-05', 'epoch': '0.2926', 'num_input_tokens_seen': 23790234, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6251', 'grad_norm': '1.062', 'learning_rate': '4.981e-05', 'epoch': '0.2926', 'num_input_tokens_seen': 23792281, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.038', 'grad_norm': '2.617', 'learning_rate': '4.981e-05', 'epoch': '0.2927', 'num_input_tokens_seen': 23794328, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7317', 'grad_norm': '1.709', 'learning_rate': '4.981e-05', 'epoch': '0.2927', 'num_input_tokens_seen': 23796375, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.965', 'grad_norm': '2.722', 'learning_rate': '4.981e-05', 'epoch': '0.2927', 'num_input_tokens_seen': 23798422, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6488', 'grad_norm': '0.985', 'learning_rate': '4.981e-05', 'epoch': '0.2927', 'num_input_tokens_seen': 23800469, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5315', 'grad_norm': '1', 'learning_rate': '4.981e-05', 'epoch': '0.2928', 'num_input_tokens_seen': 23802516, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5289', 'grad_norm': '1.209', 'learning_rate': '4.981e-05', 'epoch': '0.2928', 'num_input_tokens_seen': 23804563, 'train_runtime': '1.204e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2661', 'grad_norm': '0.8762', 'learning_rate': '4.981e-05', 'epoch': '0.2928', 'num_input_tokens_seen': 23806610, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.322', 'grad_norm': '0.7567', 'learning_rate': '4.981e-05', 'epoch': '0.2928', 'num_input_tokens_seen': 23808657, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3202', 'grad_norm': '0.9432', 'learning_rate': '4.981e-05', 'epoch': '0.2929', 'num_input_tokens_seen': 23810704, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.005', 'grad_norm': '3.098', 'learning_rate': '4.981e-05', 'epoch': '0.2929', 'num_input_tokens_seen': 23812751, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2972', 'grad_norm': '0.9655', 'learning_rate': '4.981e-05', 'epoch': '0.2929', 'num_input_tokens_seen': 23814798, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5577', 'grad_norm': '1.35', 'learning_rate': '4.981e-05', 'epoch': '0.2929', 'num_input_tokens_seen': 23816845, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2863', 'grad_norm': '0.8864', 'learning_rate': '4.981e-05', 'epoch': '0.293', 'num_input_tokens_seen': 23818892, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9524', 'grad_norm': '1.352', 'learning_rate': '4.981e-05', 'epoch': '0.293', 'num_input_tokens_seen': 23820939, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4608', 'grad_norm': '1.004', 'learning_rate': '4.981e-05', 'epoch': '0.293', 'num_input_tokens_seen': 23822986, 'train_runtime': '1.205e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8167', 'grad_norm': '1.747', 'learning_rate': '4.981e-05', 'epoch': '0.293', 'num_input_tokens_seen': 23825033, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8539', 'grad_norm': '1.191', 'learning_rate': '4.981e-05', 'epoch': '0.2931', 'num_input_tokens_seen': 23827080, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.444', 'grad_norm': '2.241', 'learning_rate': '4.981e-05', 'epoch': '0.2931', 'num_input_tokens_seen': 23829127, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5079', 'grad_norm': '1.066', 'learning_rate': '4.981e-05', 'epoch': '0.2931', 'num_input_tokens_seen': 23831174, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.782', 'grad_norm': '3.173', 'learning_rate': '4.981e-05', 'epoch': '0.2931', 'num_input_tokens_seen': 23833221, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4484', 'grad_norm': '1.215', 'learning_rate': '4.981e-05', 'epoch': '0.2932', 'num_input_tokens_seen': 23835268, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5504', 'grad_norm': '1.137', 'learning_rate': '4.981e-05', 'epoch': '0.2932', 'num_input_tokens_seen': 23837315, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2407', 'grad_norm': '0.7973', 'learning_rate': '4.981e-05', 'epoch': '0.2932', 'num_input_tokens_seen': 23839362, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.446', 'grad_norm': '2.463', 'learning_rate': '4.981e-05', 'epoch': '0.2932', 'num_input_tokens_seen': 23841409, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5174', 'grad_norm': '1.089', 'learning_rate': '4.981e-05', 'epoch': '0.2933', 'num_input_tokens_seen': 23843456, 'train_runtime': '1.206e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3383', 'grad_norm': '0.9076', 'learning_rate': '4.981e-05', 'epoch': '0.2933', 'num_input_tokens_seen': 23845503, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8671', 'grad_norm': '1.329', 'learning_rate': '4.981e-05', 'epoch': '0.2933', 'num_input_tokens_seen': 23847550, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7749', 'grad_norm': '1.024', 'learning_rate': '4.981e-05', 'epoch': '0.2934', 'num_input_tokens_seen': 23849597, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8343', 'grad_norm': '1.568', 'learning_rate': '4.981e-05', 'epoch': '0.2934', 'num_input_tokens_seen': 23851644, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9462', 'grad_norm': '1.453', 'learning_rate': '4.981e-05', 'epoch': '0.2934', 'num_input_tokens_seen': 23853691, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4324', 'grad_norm': '1.196', 'learning_rate': '4.981e-05', 'epoch': '0.2934', 'num_input_tokens_seen': 23855738, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6037', 'grad_norm': '1.409', 'learning_rate': '4.981e-05', 'epoch': '0.2935', 'num_input_tokens_seen': 23857785, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6231', 'grad_norm': '1.262', 'learning_rate': '4.981e-05', 'epoch': '0.2935', 'num_input_tokens_seen': 23859832, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4121', 'grad_norm': '0.8168', 'learning_rate': '4.981e-05', 'epoch': '0.2935', 'num_input_tokens_seen': 23861879, 'train_runtime': '1.207e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.568', 'grad_norm': '1.367', 'learning_rate': '4.981e-05', 'epoch': '0.2935', 'num_input_tokens_seen': 23863926, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8662', 'grad_norm': '1.554', 'learning_rate': '4.981e-05', 'epoch': '0.2936', 'num_input_tokens_seen': 23865973, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7244', 'grad_norm': '1.275', 'learning_rate': '4.981e-05', 'epoch': '0.2936', 'num_input_tokens_seen': 23868020, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.373', 'grad_norm': '1.95', 'learning_rate': '4.981e-05', 'epoch': '0.2936', 'num_input_tokens_seen': 23870067, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.001', 'grad_norm': '1.487', 'learning_rate': '4.981e-05', 'epoch': '0.2936', 'num_input_tokens_seen': 23872114, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6103', 'grad_norm': '0.9756', 'learning_rate': '4.981e-05', 'epoch': '0.2937', 'num_input_tokens_seen': 23874161, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.913', 'grad_norm': '1.867', 'learning_rate': '4.981e-05', 'epoch': '0.2937', 'num_input_tokens_seen': 23876208, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6604', 'grad_norm': '1.721', 'learning_rate': '4.981e-05', 'epoch': '0.2937', 'num_input_tokens_seen': 23878255, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.171', 'grad_norm': '2.141', 'learning_rate': '4.981e-05', 'epoch': '0.2937', 'num_input_tokens_seen': 23880302, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5133', 'grad_norm': '1.112', 'learning_rate': '4.981e-05', 'epoch': '0.2938', 'num_input_tokens_seen': 23882349, 'train_runtime': '1.208e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.061', 'grad_norm': '1.827', 'learning_rate': '4.981e-05', 'epoch': '0.2938', 'num_input_tokens_seen': 23884396, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.162', 'grad_norm': '1.897', 'learning_rate': '4.981e-05', 'epoch': '0.2938', 'num_input_tokens_seen': 23886443, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.79', 'grad_norm': '2.249', 'learning_rate': '4.981e-05', 'epoch': '0.2938', 'num_input_tokens_seen': 23888490, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1988', 'grad_norm': '0.7699', 'learning_rate': '4.981e-05', 'epoch': '0.2939', 'num_input_tokens_seen': 23890537, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.05', 'grad_norm': '2.173', 'learning_rate': '4.981e-05', 'epoch': '0.2939', 'num_input_tokens_seen': 23892584, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6301', 'grad_norm': '1.131', 'learning_rate': '4.981e-05', 'epoch': '0.2939', 'num_input_tokens_seen': 23894631, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9816', 'grad_norm': '1.558', 'learning_rate': '4.981e-05', 'epoch': '0.2939', 'num_input_tokens_seen': 23896678, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7065', 'grad_norm': '0.9311', 'learning_rate': '4.981e-05', 'epoch': '0.294', 'num_input_tokens_seen': 23898725, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4627', 'grad_norm': '1.08', 'learning_rate': '4.981e-05', 'epoch': '0.294', 'num_input_tokens_seen': 23900772, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4238', 'grad_norm': '1.18', 'learning_rate': '4.981e-05', 'epoch': '0.294', 'num_input_tokens_seen': 23902819, 'train_runtime': '1.209e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7147', 'grad_norm': '1.056', 'learning_rate': '4.981e-05', 'epoch': '0.294', 'num_input_tokens_seen': 23904866, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3864', 'grad_norm': '1.038', 'learning_rate': '4.981e-05', 'epoch': '0.2941', 'num_input_tokens_seen': 23906913, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9012', 'grad_norm': '1.361', 'learning_rate': '4.981e-05', 'epoch': '0.2941', 'num_input_tokens_seen': 23908960, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9574', 'grad_norm': '1.663', 'learning_rate': '4.981e-05', 'epoch': '0.2941', 'num_input_tokens_seen': 23911007, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3324', 'grad_norm': '0.9107', 'learning_rate': '4.981e-05', 'epoch': '0.2941', 'num_input_tokens_seen': 23913054, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6934', 'grad_norm': '1.51', 'learning_rate': '4.981e-05', 'epoch': '0.2942', 'num_input_tokens_seen': 23915101, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7944', 'grad_norm': '1.263', 'learning_rate': '4.981e-05', 'epoch': '0.2942', 'num_input_tokens_seen': 23917148, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.921', 'grad_norm': '1.746', 'learning_rate': '4.981e-05', 'epoch': '0.2942', 'num_input_tokens_seen': 23919195, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.542', 'grad_norm': '4.358', 'learning_rate': '4.981e-05', 'epoch': '0.2942', 'num_input_tokens_seen': 23921242, 'train_runtime': '1.21e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4426', 'grad_norm': '0.9918', 'learning_rate': '4.981e-05', 'epoch': '0.2943', 'num_input_tokens_seen': 23923289, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.101', 'grad_norm': '1.492', 'learning_rate': '4.981e-05', 'epoch': '0.2943', 'num_input_tokens_seen': 23925336, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5879', 'grad_norm': '1.393', 'learning_rate': '4.981e-05', 'epoch': '0.2943', 'num_input_tokens_seen': 23927383, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5708', 'grad_norm': '1.117', 'learning_rate': '4.981e-05', 'epoch': '0.2943', 'num_input_tokens_seen': 23929430, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2116', 'grad_norm': '0.8209', 'learning_rate': '4.981e-05', 'epoch': '0.2944', 'num_input_tokens_seen': 23931477, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3667', 'grad_norm': '1.033', 'learning_rate': '4.981e-05', 'epoch': '0.2944', 'num_input_tokens_seen': 23933524, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2945', 'grad_norm': '0.9561', 'learning_rate': '4.981e-05', 'epoch': '0.2944', 'num_input_tokens_seen': 23935571, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4726', 'grad_norm': '1.12', 'learning_rate': '4.981e-05', 'epoch': '0.2944', 'num_input_tokens_seen': 23937618, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5565', 'grad_norm': '1.092', 'learning_rate': '4.981e-05', 'epoch': '0.2945', 'num_input_tokens_seen': 23939665, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4153', 'grad_norm': '1.071', 'learning_rate': '4.981e-05', 'epoch': '0.2945', 'num_input_tokens_seen': 23941712, 'train_runtime': '1.211e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.477', 'grad_norm': '0.8852', 'learning_rate': '4.981e-05', 'epoch': '0.2945', 'num_input_tokens_seen': 23943759, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5099', 'grad_norm': '1.224', 'learning_rate': '4.981e-05', 'epoch': '0.2945', 'num_input_tokens_seen': 23945806, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8167', 'grad_norm': '1.368', 'learning_rate': '4.981e-05', 'epoch': '0.2946', 'num_input_tokens_seen': 23947853, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3761', 'grad_norm': '0.9418', 'learning_rate': '4.981e-05', 'epoch': '0.2946', 'num_input_tokens_seen': 23949900, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.264', 'grad_norm': '2.262', 'learning_rate': '4.981e-05', 'epoch': '0.2946', 'num_input_tokens_seen': 23951947, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.431', 'grad_norm': '1.072', 'learning_rate': '4.981e-05', 'epoch': '0.2946', 'num_input_tokens_seen': 23953994, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5221', 'grad_norm': '1.146', 'learning_rate': '4.981e-05', 'epoch': '0.2947', 'num_input_tokens_seen': 23956041, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4345', 'grad_norm': '1.089', 'learning_rate': '4.981e-05', 'epoch': '0.2947', 'num_input_tokens_seen': 23958088, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.068', 'grad_norm': '1.389', 'learning_rate': '4.981e-05', 'epoch': '0.2947', 'num_input_tokens_seen': 23960135, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4549', 'grad_norm': '1.209', 'learning_rate': '4.981e-05', 'epoch': '0.2947', 'num_input_tokens_seen': 23962182, 'train_runtime': '1.212e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5299', 'grad_norm': '0.9798', 'learning_rate': '4.981e-05', 'epoch': '0.2948', 'num_input_tokens_seen': 23964229, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2637', 'grad_norm': '0.8552', 'learning_rate': '4.981e-05', 'epoch': '0.2948', 'num_input_tokens_seen': 23966276, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.654', 'grad_norm': '3.695', 'learning_rate': '4.981e-05', 'epoch': '0.2948', 'num_input_tokens_seen': 23968323, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7427', 'grad_norm': '1.275', 'learning_rate': '4.981e-05', 'epoch': '0.2948', 'num_input_tokens_seen': 23970370, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9506', 'grad_norm': '1.12', 'learning_rate': '4.981e-05', 'epoch': '0.2949', 'num_input_tokens_seen': 23972417, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7142', 'grad_norm': '1.172', 'learning_rate': '4.981e-05', 'epoch': '0.2949', 'num_input_tokens_seen': 23974464, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.288', 'grad_norm': '2.152', 'learning_rate': '4.981e-05', 'epoch': '0.2949', 'num_input_tokens_seen': 23976511, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5777', 'grad_norm': '1.313', 'learning_rate': '4.981e-05', 'epoch': '0.2949', 'num_input_tokens_seen': 23978558, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4958', 'grad_norm': '1.316', 'learning_rate': '4.981e-05', 'epoch': '0.295', 'num_input_tokens_seen': 23980605, 'train_runtime': '1.213e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.144', 'grad_norm': '1.614', 'learning_rate': '4.98e-05', 'epoch': '0.295', 'num_input_tokens_seen': 23982652, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.193', 'grad_norm': '2.227', 'learning_rate': '4.98e-05', 'epoch': '0.295', 'num_input_tokens_seen': 23984699, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.239', 'grad_norm': '2.072', 'learning_rate': '4.98e-05', 'epoch': '0.295', 'num_input_tokens_seen': 23986746, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8774', 'grad_norm': '1.379', 'learning_rate': '4.98e-05', 'epoch': '0.2951', 'num_input_tokens_seen': 23988793, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2346', 'grad_norm': '0.9862', 'learning_rate': '4.98e-05', 'epoch': '0.2951', 'num_input_tokens_seen': 23990840, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.387', 'grad_norm': '1.224', 'learning_rate': '4.98e-05', 'epoch': '0.2951', 'num_input_tokens_seen': 23992887, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.093', 'grad_norm': '1.919', 'learning_rate': '4.98e-05', 'epoch': '0.2951', 'num_input_tokens_seen': 23994934, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7481', 'grad_norm': '1.112', 'learning_rate': '4.98e-05', 'epoch': '0.2952', 'num_input_tokens_seen': 23996981, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9516', 'grad_norm': '1.816', 'learning_rate': '4.98e-05', 'epoch': '0.2952', 'num_input_tokens_seen': 23999028, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8969', 'grad_norm': '2.138', 'learning_rate': '4.98e-05', 'epoch': '0.2952', 'num_input_tokens_seen': 24001075, 'train_runtime': '1.214e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.431', 'grad_norm': '1.079', 'learning_rate': '4.98e-05', 'epoch': '0.2952', 'num_input_tokens_seen': 24003122, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7854', 'grad_norm': '1.478', 'learning_rate': '4.98e-05', 'epoch': '0.2953', 'num_input_tokens_seen': 24005169, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3565', 'grad_norm': '1.063', 'learning_rate': '4.98e-05', 'epoch': '0.2953', 'num_input_tokens_seen': 24007216, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7215', 'grad_norm': '1.631', 'learning_rate': '4.98e-05', 'epoch': '0.2953', 'num_input_tokens_seen': 24009263, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5827', 'grad_norm': '1.255', 'learning_rate': '4.98e-05', 'epoch': '0.2953', 'num_input_tokens_seen': 24011310, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4298', 'grad_norm': '1.103', 'learning_rate': '4.98e-05', 'epoch': '0.2954', 'num_input_tokens_seen': 24013357, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2678', 'grad_norm': '0.7637', 'learning_rate': '4.98e-05', 'epoch': '0.2954', 'num_input_tokens_seen': 24015404, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4467', 'grad_norm': '1.116', 'learning_rate': '4.98e-05', 'epoch': '0.2954', 'num_input_tokens_seen': 24017451, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.241', 'grad_norm': '2.178', 'learning_rate': '4.98e-05', 'epoch': '0.2954', 'num_input_tokens_seen': 24019498, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3502', 'grad_norm': '0.975', 'learning_rate': '4.98e-05', 'epoch': '0.2955', 'num_input_tokens_seen': 24021545, 'train_runtime': '1.215e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8469', 'grad_norm': '1.18', 'learning_rate': '4.98e-05', 'epoch': '0.2955', 'num_input_tokens_seen': 24023592, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2122', 'grad_norm': '0.8751', 'learning_rate': '4.98e-05', 'epoch': '0.2955', 'num_input_tokens_seen': 24025639, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4377', 'grad_norm': '1.441', 'learning_rate': '4.98e-05', 'epoch': '0.2955', 'num_input_tokens_seen': 24027686, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7584', 'grad_norm': '1.413', 'learning_rate': '4.98e-05', 'epoch': '0.2956', 'num_input_tokens_seen': 24029733, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9081', 'grad_norm': '1.71', 'learning_rate': '4.98e-05', 'epoch': '0.2956', 'num_input_tokens_seen': 24031780, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1948', 'grad_norm': '0.801', 'learning_rate': '4.98e-05', 'epoch': '0.2956', 'num_input_tokens_seen': 24033827, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4221', 'grad_norm': '1.079', 'learning_rate': '4.98e-05', 'epoch': '0.2956', 'num_input_tokens_seen': 24035874, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6534', 'grad_norm': '1.435', 'learning_rate': '4.98e-05', 'epoch': '0.2957', 'num_input_tokens_seen': 24037921, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9709', 'grad_norm': '1.967', 'learning_rate': '4.98e-05', 'epoch': '0.2957', 'num_input_tokens_seen': 24039968, 'train_runtime': '1.216e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2484', 'grad_norm': '0.7995', 'learning_rate': '4.98e-05', 'epoch': '0.2957', 'num_input_tokens_seen': 24042015, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2929', 'grad_norm': '0.9197', 'learning_rate': '4.98e-05', 'epoch': '0.2957', 'num_input_tokens_seen': 24044062, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3589', 'grad_norm': '0.9431', 'learning_rate': '4.98e-05', 'epoch': '0.2958', 'num_input_tokens_seen': 24046109, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4633', 'grad_norm': '1.41', 'learning_rate': '4.98e-05', 'epoch': '0.2958', 'num_input_tokens_seen': 24048156, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.457', 'grad_norm': '2.36', 'learning_rate': '4.98e-05', 'epoch': '0.2958', 'num_input_tokens_seen': 24050203, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.286', 'grad_norm': '0.9815', 'learning_rate': '4.98e-05', 'epoch': '0.2958', 'num_input_tokens_seen': 24052250, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8879', 'grad_norm': '1.584', 'learning_rate': '4.98e-05', 'epoch': '0.2959', 'num_input_tokens_seen': 24054297, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1546', 'grad_norm': '0.7639', 'learning_rate': '4.98e-05', 'epoch': '0.2959', 'num_input_tokens_seen': 24056344, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2732', 'grad_norm': '0.9568', 'learning_rate': '4.98e-05', 'epoch': '0.2959', 'num_input_tokens_seen': 24058391, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4485', 'grad_norm': '1.173', 'learning_rate': '4.98e-05', 'epoch': '0.2959', 'num_input_tokens_seen': 24060438, 'train_runtime': '1.217e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3599', 'grad_norm': '0.902', 'learning_rate': '4.98e-05', 'epoch': '0.296', 'num_input_tokens_seen': 24062485, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2947', 'grad_norm': '0.9633', 'learning_rate': '4.98e-05', 'epoch': '0.296', 'num_input_tokens_seen': 24064532, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.415', 'grad_norm': '2.143', 'learning_rate': '4.98e-05', 'epoch': '0.296', 'num_input_tokens_seen': 24066579, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7723', 'grad_norm': '1.169', 'learning_rate': '4.98e-05', 'epoch': '0.296', 'num_input_tokens_seen': 24068626, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.998', 'grad_norm': '2.587', 'learning_rate': '4.98e-05', 'epoch': '0.2961', 'num_input_tokens_seen': 24070673, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.043', 'grad_norm': '1.014', 'learning_rate': '4.98e-05', 'epoch': '0.2961', 'num_input_tokens_seen': 24072720, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.079', 'grad_norm': '1.823', 'learning_rate': '4.98e-05', 'epoch': '0.2961', 'num_input_tokens_seen': 24074767, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7379', 'grad_norm': '1.848', 'learning_rate': '4.98e-05', 'epoch': '0.2961', 'num_input_tokens_seen': 24076814, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.45', 'grad_norm': '2.668', 'learning_rate': '4.98e-05', 'epoch': '0.2962', 'num_input_tokens_seen': 24078861, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.11', 'grad_norm': '1.32', 'learning_rate': '4.98e-05', 'epoch': '0.2962', 'num_input_tokens_seen': 24080908, 'train_runtime': '1.218e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.883', 'grad_norm': '2.368', 'learning_rate': '4.98e-05', 'epoch': '0.2962', 'num_input_tokens_seen': 24082955, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4781', 'grad_norm': '1.03', 'learning_rate': '4.98e-05', 'epoch': '0.2962', 'num_input_tokens_seen': 24085002, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6589', 'grad_norm': '1.193', 'learning_rate': '4.98e-05', 'epoch': '0.2963', 'num_input_tokens_seen': 24087049, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9465', 'grad_norm': '1.464', 'learning_rate': '4.98e-05', 'epoch': '0.2963', 'num_input_tokens_seen': 24089096, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9655', 'grad_norm': '1.551', 'learning_rate': '4.98e-05', 'epoch': '0.2963', 'num_input_tokens_seen': 24091143, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3669', 'grad_norm': '0.792', 'learning_rate': '4.98e-05', 'epoch': '0.2963', 'num_input_tokens_seen': 24093190, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9916', 'grad_norm': '2.119', 'learning_rate': '4.98e-05', 'epoch': '0.2964', 'num_input_tokens_seen': 24095237, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4292', 'grad_norm': '0.7781', 'learning_rate': '4.98e-05', 'epoch': '0.2964', 'num_input_tokens_seen': 24097284, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8251', 'grad_norm': '1.636', 'learning_rate': '4.98e-05', 'epoch': '0.2964', 'num_input_tokens_seen': 24099331, 'train_runtime': '1.219e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2613', 'grad_norm': '0.8488', 'learning_rate': '4.98e-05', 'epoch': '0.2964', 'num_input_tokens_seen': 24101378, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7855', 'grad_norm': '1.335', 'learning_rate': '4.98e-05', 'epoch': '0.2965', 'num_input_tokens_seen': 24103425, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.565', 'grad_norm': '1.319', 'learning_rate': '4.98e-05', 'epoch': '0.2965', 'num_input_tokens_seen': 24105472, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6901', 'grad_norm': '1.229', 'learning_rate': '4.98e-05', 'epoch': '0.2965', 'num_input_tokens_seen': 24107519, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.742', 'grad_norm': '1.962', 'learning_rate': '4.98e-05', 'epoch': '0.2965', 'num_input_tokens_seen': 24109566, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8153', 'grad_norm': '1.303', 'learning_rate': '4.98e-05', 'epoch': '0.2966', 'num_input_tokens_seen': 24111613, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3758', 'grad_norm': '0.7813', 'learning_rate': '4.98e-05', 'epoch': '0.2966', 'num_input_tokens_seen': 24113660, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5367', 'grad_norm': '1.088', 'learning_rate': '4.98e-05', 'epoch': '0.2966', 'num_input_tokens_seen': 24115707, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.651', 'grad_norm': '1.311', 'learning_rate': '4.98e-05', 'epoch': '0.2966', 'num_input_tokens_seen': 24117754, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4783', 'grad_norm': '1.043', 'learning_rate': '4.98e-05', 'epoch': '0.2967', 'num_input_tokens_seen': 24119801, 'train_runtime': '1.22e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.344', 'grad_norm': '2.221', 'learning_rate': '4.98e-05', 'epoch': '0.2967', 'num_input_tokens_seen': 24121848, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6485', 'grad_norm': '1.36', 'learning_rate': '4.98e-05', 'epoch': '0.2967', 'num_input_tokens_seen': 24123895, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5925', 'grad_norm': '1.305', 'learning_rate': '4.98e-05', 'epoch': '0.2967', 'num_input_tokens_seen': 24125942, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2066', 'grad_norm': '0.8098', 'learning_rate': '4.98e-05', 'epoch': '0.2968', 'num_input_tokens_seen': 24127989, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.294', 'grad_norm': '2.22', 'learning_rate': '4.98e-05', 'epoch': '0.2968', 'num_input_tokens_seen': 24130036, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2744', 'grad_norm': '0.9294', 'learning_rate': '4.98e-05', 'epoch': '0.2968', 'num_input_tokens_seen': 24132083, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2835', 'grad_norm': '0.9723', 'learning_rate': '4.98e-05', 'epoch': '0.2969', 'num_input_tokens_seen': 24134130, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.393', 'grad_norm': '0.9408', 'learning_rate': '4.98e-05', 'epoch': '0.2969', 'num_input_tokens_seen': 24136177, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8989', 'grad_norm': '2.016', 'learning_rate': '4.98e-05', 'epoch': '0.2969', 'num_input_tokens_seen': 24138224, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5634', 'grad_norm': '1.042', 'learning_rate': '4.98e-05', 'epoch': '0.2969', 'num_input_tokens_seen': 24140271, 'train_runtime': '1.221e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6732', 'grad_norm': '1.21', 'learning_rate': '4.98e-05', 'epoch': '0.297', 'num_input_tokens_seen': 24142318, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5823', 'grad_norm': '1.621', 'learning_rate': '4.98e-05', 'epoch': '0.297', 'num_input_tokens_seen': 24144365, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.769', 'grad_norm': '1.404', 'learning_rate': '4.98e-05', 'epoch': '0.297', 'num_input_tokens_seen': 24146412, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.663', 'grad_norm': '2.37', 'learning_rate': '4.98e-05', 'epoch': '0.297', 'num_input_tokens_seen': 24148459, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.162', 'grad_norm': '1.727', 'learning_rate': '4.98e-05', 'epoch': '0.2971', 'num_input_tokens_seen': 24150506, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3891', 'grad_norm': '0.9835', 'learning_rate': '4.98e-05', 'epoch': '0.2971', 'num_input_tokens_seen': 24152553, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3888', 'grad_norm': '1.343', 'learning_rate': '4.98e-05', 'epoch': '0.2971', 'num_input_tokens_seen': 24154600, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.218', 'grad_norm': '2.238', 'learning_rate': '4.98e-05', 'epoch': '0.2971', 'num_input_tokens_seen': 24156647, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.081', 'grad_norm': '1.665', 'learning_rate': '4.98e-05', 'epoch': '0.2972', 'num_input_tokens_seen': 24158694, 'train_runtime': '1.222e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7775', 'grad_norm': '1.661', 'learning_rate': '4.98e-05', 'epoch': '0.2972', 'num_input_tokens_seen': 24160741, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7979', 'grad_norm': '1.292', 'learning_rate': '4.98e-05', 'epoch': '0.2972', 'num_input_tokens_seen': 24162788, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.948', 'grad_norm': '1.388', 'learning_rate': '4.98e-05', 'epoch': '0.2972', 'num_input_tokens_seen': 24164835, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.681', 'grad_norm': '1.261', 'learning_rate': '4.98e-05', 'epoch': '0.2973', 'num_input_tokens_seen': 24166882, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.556', 'grad_norm': '1.413', 'learning_rate': '4.98e-05', 'epoch': '0.2973', 'num_input_tokens_seen': 24168929, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.025', 'grad_norm': '2.296', 'learning_rate': '4.98e-05', 'epoch': '0.2973', 'num_input_tokens_seen': 24170976, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8754', 'grad_norm': '1.232', 'learning_rate': '4.98e-05', 'epoch': '0.2973', 'num_input_tokens_seen': 24173023, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7709', 'grad_norm': '1.137', 'learning_rate': '4.98e-05', 'epoch': '0.2974', 'num_input_tokens_seen': 24175070, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3117', 'grad_norm': '0.7907', 'learning_rate': '4.98e-05', 'epoch': '0.2974', 'num_input_tokens_seen': 24177117, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.293', 'grad_norm': '1.981', 'learning_rate': '4.98e-05', 'epoch': '0.2974', 'num_input_tokens_seen': 24179164, 'train_runtime': '1.223e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2549', 'grad_norm': '0.8803', 'learning_rate': '4.98e-05', 'epoch': '0.2974', 'num_input_tokens_seen': 24181211, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.751', 'grad_norm': '1.891', 'learning_rate': '4.98e-05', 'epoch': '0.2975', 'num_input_tokens_seen': 24183258, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.36', 'grad_norm': '2.38', 'learning_rate': '4.98e-05', 'epoch': '0.2975', 'num_input_tokens_seen': 24185305, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.719', 'grad_norm': '1.057', 'learning_rate': '4.98e-05', 'epoch': '0.2975', 'num_input_tokens_seen': 24187352, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5467', 'grad_norm': '0.8791', 'learning_rate': '4.98e-05', 'epoch': '0.2975', 'num_input_tokens_seen': 24189399, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.379', 'grad_norm': '0.9149', 'learning_rate': '4.98e-05', 'epoch': '0.2976', 'num_input_tokens_seen': 24191446, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8157', 'grad_norm': '1.553', 'learning_rate': '4.98e-05', 'epoch': '0.2976', 'num_input_tokens_seen': 24193493, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6902', 'grad_norm': '1.262', 'learning_rate': '4.98e-05', 'epoch': '0.2976', 'num_input_tokens_seen': 24195540, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4234', 'grad_norm': '0.8535', 'learning_rate': '4.98e-05', 'epoch': '0.2976', 'num_input_tokens_seen': 24197587, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5912', 'grad_norm': '1.259', 'learning_rate': '4.98e-05', 'epoch': '0.2977', 'num_input_tokens_seen': 24199634, 'train_runtime': '1.224e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.46', 'grad_norm': '2.27', 'learning_rate': '4.98e-05', 'epoch': '0.2977', 'num_input_tokens_seen': 24201681, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.264', 'grad_norm': '2.216', 'learning_rate': '4.98e-05', 'epoch': '0.2977', 'num_input_tokens_seen': 24203728, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9671', 'grad_norm': '1.448', 'learning_rate': '4.98e-05', 'epoch': '0.2977', 'num_input_tokens_seen': 24205775, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.274', 'grad_norm': '2.342', 'learning_rate': '4.98e-05', 'epoch': '0.2978', 'num_input_tokens_seen': 24207822, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.105', 'grad_norm': '1.735', 'learning_rate': '4.98e-05', 'epoch': '0.2978', 'num_input_tokens_seen': 24209869, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2266', 'grad_norm': '0.882', 'learning_rate': '4.98e-05', 'epoch': '0.2978', 'num_input_tokens_seen': 24211916, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4344', 'grad_norm': '1.039', 'learning_rate': '4.98e-05', 'epoch': '0.2978', 'num_input_tokens_seen': 24213963, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4563', 'grad_norm': '1.192', 'learning_rate': '4.98e-05', 'epoch': '0.2979', 'num_input_tokens_seen': 24216010, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4376', 'grad_norm': '0.8252', 'learning_rate': '4.98e-05', 'epoch': '0.2979', 'num_input_tokens_seen': 24218057, 'train_runtime': '1.225e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3144', 'grad_norm': '0.7878', 'learning_rate': '4.98e-05', 'epoch': '0.2979', 'num_input_tokens_seen': 24220104, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6327', 'grad_norm': '0.8852', 'learning_rate': '4.98e-05', 'epoch': '0.2979', 'num_input_tokens_seen': 24222151, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6875', 'grad_norm': '1.308', 'learning_rate': '4.98e-05', 'epoch': '0.298', 'num_input_tokens_seen': 24224198, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6465', 'grad_norm': '1.23', 'learning_rate': '4.98e-05', 'epoch': '0.298', 'num_input_tokens_seen': 24226245, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.432', 'grad_norm': '0.9467', 'learning_rate': '4.98e-05', 'epoch': '0.298', 'num_input_tokens_seen': 24228292, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2422', 'grad_norm': '0.9226', 'learning_rate': '4.98e-05', 'epoch': '0.298', 'num_input_tokens_seen': 24230339, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7991', 'grad_norm': '1.719', 'learning_rate': '4.98e-05', 'epoch': '0.2981', 'num_input_tokens_seen': 24232386, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3541', 'grad_norm': '0.8474', 'learning_rate': '4.98e-05', 'epoch': '0.2981', 'num_input_tokens_seen': 24234433, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6627', 'grad_norm': '1.261', 'learning_rate': '4.98e-05', 'epoch': '0.2981', 'num_input_tokens_seen': 24236480, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8792', 'grad_norm': '1.657', 'learning_rate': '4.98e-05', 'epoch': '0.2981', 'num_input_tokens_seen': 24238527, 'train_runtime': '1.226e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.077', 'grad_norm': '1.552', 'learning_rate': '4.98e-05', 'epoch': '0.2982', 'num_input_tokens_seen': 24240574, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2868', 'grad_norm': '0.8437', 'learning_rate': '4.98e-05', 'epoch': '0.2982', 'num_input_tokens_seen': 24242621, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4644', 'grad_norm': '1', 'learning_rate': '4.98e-05', 'epoch': '0.2982', 'num_input_tokens_seen': 24244668, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5818', 'grad_norm': '1.237', 'learning_rate': '4.98e-05', 'epoch': '0.2982', 'num_input_tokens_seen': 24246715, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3235', 'grad_norm': '1.105', 'learning_rate': '4.98e-05', 'epoch': '0.2983', 'num_input_tokens_seen': 24248762, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5923', 'grad_norm': '1.125', 'learning_rate': '4.98e-05', 'epoch': '0.2983', 'num_input_tokens_seen': 24250809, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4118', 'grad_norm': '1.018', 'learning_rate': '4.98e-05', 'epoch': '0.2983', 'num_input_tokens_seen': 24252856, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5383', 'grad_norm': '1.264', 'learning_rate': '4.98e-05', 'epoch': '0.2983', 'num_input_tokens_seen': 24254903, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5723', 'grad_norm': '0.8904', 'learning_rate': '4.98e-05', 'epoch': '0.2984', 'num_input_tokens_seen': 24256950, 'train_runtime': '1.227e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3828', 'grad_norm': '1.042', 'learning_rate': '4.98e-05', 'epoch': '0.2984', 'num_input_tokens_seen': 24258997, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7702', 'grad_norm': '1.459', 'learning_rate': '4.98e-05', 'epoch': '0.2984', 'num_input_tokens_seen': 24261044, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3306', 'grad_norm': '0.7624', 'learning_rate': '4.98e-05', 'epoch': '0.2984', 'num_input_tokens_seen': 24263091, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9078', 'grad_norm': '1.541', 'learning_rate': '4.98e-05', 'epoch': '0.2985', 'num_input_tokens_seen': 24265138, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3303', 'grad_norm': '0.8821', 'learning_rate': '4.98e-05', 'epoch': '0.2985', 'num_input_tokens_seen': 24267185, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8172', 'grad_norm': '1.209', 'learning_rate': '4.98e-05', 'epoch': '0.2985', 'num_input_tokens_seen': 24269232, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9366', 'grad_norm': '1.827', 'learning_rate': '4.98e-05', 'epoch': '0.2985', 'num_input_tokens_seen': 24271279, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.385', 'grad_norm': '2.412', 'learning_rate': '4.98e-05', 'epoch': '0.2986', 'num_input_tokens_seen': 24273326, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3474', 'grad_norm': '1.037', 'learning_rate': '4.98e-05', 'epoch': '0.2986', 'num_input_tokens_seen': 24275373, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8373', 'grad_norm': '1.496', 'learning_rate': '4.98e-05', 'epoch': '0.2986', 'num_input_tokens_seen': 24277420, 'train_runtime': '1.228e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.058', 'grad_norm': '1.955', 'learning_rate': '4.98e-05', 'epoch': '0.2986', 'num_input_tokens_seen': 24279467, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.52', 'grad_norm': '2.479', 'learning_rate': '4.98e-05', 'epoch': '0.2987', 'num_input_tokens_seen': 24281514, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.051', 'grad_norm': '1.649', 'learning_rate': '4.98e-05', 'epoch': '0.2987', 'num_input_tokens_seen': 24283561, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.33', 'grad_norm': '2.308', 'learning_rate': '4.98e-05', 'epoch': '0.2987', 'num_input_tokens_seen': 24285608, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.318', 'grad_norm': '1.804', 'learning_rate': '4.98e-05', 'epoch': '0.2987', 'num_input_tokens_seen': 24287655, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3856', 'grad_norm': '0.8912', 'learning_rate': '4.98e-05', 'epoch': '0.2988', 'num_input_tokens_seen': 24289702, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.26', 'grad_norm': '2.043', 'learning_rate': '4.98e-05', 'epoch': '0.2988', 'num_input_tokens_seen': 24291749, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.065', 'grad_norm': '2.752', 'learning_rate': '4.98e-05', 'epoch': '0.2988', 'num_input_tokens_seen': 24293796, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4905', 'grad_norm': '1.038', 'learning_rate': '4.98e-05', 'epoch': '0.2988', 'num_input_tokens_seen': 24295843, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7232', 'grad_norm': '1.626', 'learning_rate': '4.98e-05', 'epoch': '0.2989', 'num_input_tokens_seen': 24297890, 'train_runtime': '1.229e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2887', 'grad_norm': '1.001', 'learning_rate': '4.98e-05', 'epoch': '0.2989', 'num_input_tokens_seen': 24299937, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5494', 'grad_norm': '1.183', 'learning_rate': '4.98e-05', 'epoch': '0.2989', 'num_input_tokens_seen': 24301984, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.048', 'grad_norm': '1.769', 'learning_rate': '4.98e-05', 'epoch': '0.2989', 'num_input_tokens_seen': 24304031, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.494', 'grad_norm': '2.396', 'learning_rate': '4.98e-05', 'epoch': '0.299', 'num_input_tokens_seen': 24306078, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.251', 'grad_norm': '0.7604', 'learning_rate': '4.98e-05', 'epoch': '0.299', 'num_input_tokens_seen': 24308125, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3615', 'grad_norm': '0.8589', 'learning_rate': '4.98e-05', 'epoch': '0.299', 'num_input_tokens_seen': 24310172, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2815', 'grad_norm': '0.8022', 'learning_rate': '4.98e-05', 'epoch': '0.299', 'num_input_tokens_seen': 24312219, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5265', 'grad_norm': '1.097', 'learning_rate': '4.98e-05', 'epoch': '0.2991', 'num_input_tokens_seen': 24314266, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.959', 'grad_norm': '1.932', 'learning_rate': '4.98e-05', 'epoch': '0.2991', 'num_input_tokens_seen': 24316313, 'train_runtime': '1.23e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.18', 'grad_norm': '1.732', 'learning_rate': '4.98e-05', 'epoch': '0.2991', 'num_input_tokens_seen': 24318360, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.919', 'grad_norm': '1.38', 'learning_rate': '4.98e-05', 'epoch': '0.2991', 'num_input_tokens_seen': 24320407, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.183', 'grad_norm': '1.756', 'learning_rate': '4.98e-05', 'epoch': '0.2992', 'num_input_tokens_seen': 24322454, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9942', 'grad_norm': '1.26', 'learning_rate': '4.98e-05', 'epoch': '0.2992', 'num_input_tokens_seen': 24324501, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8439', 'grad_norm': '1.333', 'learning_rate': '4.98e-05', 'epoch': '0.2992', 'num_input_tokens_seen': 24326548, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.807', 'grad_norm': '3.1', 'learning_rate': '4.98e-05', 'epoch': '0.2992', 'num_input_tokens_seen': 24328595, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.504', 'grad_norm': '2.244', 'learning_rate': '4.98e-05', 'epoch': '0.2993', 'num_input_tokens_seen': 24330642, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.031', 'grad_norm': '2.905', 'learning_rate': '4.98e-05', 'epoch': '0.2993', 'num_input_tokens_seen': 24332689, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3448', 'grad_norm': '0.8682', 'learning_rate': '4.98e-05', 'epoch': '0.2993', 'num_input_tokens_seen': 24334736, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2998', 'grad_norm': '0.6917', 'learning_rate': '4.98e-05', 'epoch': '0.2993', 'num_input_tokens_seen': 24336783, 'train_runtime': '1.231e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7579', 'grad_norm': '1.384', 'learning_rate': '4.98e-05', 'epoch': '0.2994', 'num_input_tokens_seen': 24338830, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.701', 'grad_norm': '2.456', 'learning_rate': '4.98e-05', 'epoch': '0.2994', 'num_input_tokens_seen': 24340877, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6269', 'grad_norm': '1.048', 'learning_rate': '4.98e-05', 'epoch': '0.2994', 'num_input_tokens_seen': 24342924, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9264', 'grad_norm': '1.53', 'learning_rate': '4.98e-05', 'epoch': '0.2994', 'num_input_tokens_seen': 24344971, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2148', 'grad_norm': '0.9618', 'learning_rate': '4.98e-05', 'epoch': '0.2995', 'num_input_tokens_seen': 24347018, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9729', 'grad_norm': '2.015', 'learning_rate': '4.98e-05', 'epoch': '0.2995', 'num_input_tokens_seen': 24349065, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9559', 'grad_norm': '1.402', 'learning_rate': '4.98e-05', 'epoch': '0.2995', 'num_input_tokens_seen': 24351112, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4017', 'grad_norm': '1.077', 'learning_rate': '4.98e-05', 'epoch': '0.2995', 'num_input_tokens_seen': 24353159, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5504', 'grad_norm': '1.264', 'learning_rate': '4.98e-05', 'epoch': '0.2996', 'num_input_tokens_seen': 24355206, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5051', 'grad_norm': '1.122', 'learning_rate': '4.98e-05', 'epoch': '0.2996', 'num_input_tokens_seen': 24357253, 'train_runtime': '1.232e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3815', 'grad_norm': '0.8704', 'learning_rate': '4.98e-05', 'epoch': '0.2996', 'num_input_tokens_seen': 24359300, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.021', 'grad_norm': '2.58', 'learning_rate': '4.98e-05', 'epoch': '0.2996', 'num_input_tokens_seen': 24361347, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.353', 'grad_norm': '2.333', 'learning_rate': '4.98e-05', 'epoch': '0.2997', 'num_input_tokens_seen': 24363394, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7998', 'grad_norm': '1.28', 'learning_rate': '4.98e-05', 'epoch': '0.2997', 'num_input_tokens_seen': 24365441, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6177', 'grad_norm': '1.605', 'learning_rate': '4.98e-05', 'epoch': '0.2997', 'num_input_tokens_seen': 24367488, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.234', 'grad_norm': '1.954', 'learning_rate': '4.98e-05', 'epoch': '0.2997', 'num_input_tokens_seen': 24369535, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.628', 'grad_norm': '2.729', 'learning_rate': '4.98e-05', 'epoch': '0.2998', 'num_input_tokens_seen': 24371582, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4764', 'grad_norm': '0.9715', 'learning_rate': '4.98e-05', 'epoch': '0.2998', 'num_input_tokens_seen': 24373629, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.098', 'grad_norm': '2.692', 'learning_rate': '4.98e-05', 'epoch': '0.2998', 'num_input_tokens_seen': 24375676, 'train_runtime': '1.233e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6935', 'grad_norm': '1.391', 'learning_rate': '4.98e-05', 'epoch': '0.2998', 'num_input_tokens_seen': 24377723, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4921', 'grad_norm': '1.256', 'learning_rate': '4.98e-05', 'epoch': '0.2999', 'num_input_tokens_seen': 24379770, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5346', 'grad_norm': '1.308', 'learning_rate': '4.98e-05', 'epoch': '0.2999', 'num_input_tokens_seen': 24381817, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7764', 'grad_norm': '1.47', 'learning_rate': '4.979e-05', 'epoch': '0.2999', 'num_input_tokens_seen': 24383864, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3183', 'grad_norm': '0.7116', 'learning_rate': '4.979e-05', 'epoch': '0.2999', 'num_input_tokens_seen': 24385911, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3926', 'grad_norm': '0.9663', 'learning_rate': '4.979e-05', 'epoch': '0.3', 'num_input_tokens_seen': 24387958, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.275', 'grad_norm': '2.143', 'learning_rate': '4.979e-05', 'epoch': '0.3', 'num_input_tokens_seen': 24390005, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7805', 'grad_norm': '1.762', 'learning_rate': '4.979e-05', 'epoch': '0.3', 'num_input_tokens_seen': 24392052, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2402', 'grad_norm': '0.7989', 'learning_rate': '4.979e-05', 'epoch': '0.3', 'num_input_tokens_seen': 24394099, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4724', 'grad_norm': '1.08', 'learning_rate': '4.979e-05', 'epoch': '0.3001', 'num_input_tokens_seen': 24396146, 'train_runtime': '1.234e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6262', 'grad_norm': '1.061', 'learning_rate': '4.979e-05', 'epoch': '0.3001', 'num_input_tokens_seen': 24398193, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.602', 'grad_norm': '2.483', 'learning_rate': '4.979e-05', 'epoch': '0.3001', 'num_input_tokens_seen': 24400240, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3246', 'grad_norm': '0.943', 'learning_rate': '4.979e-05', 'epoch': '0.3001', 'num_input_tokens_seen': 24402287, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3223', 'grad_norm': '0.8425', 'learning_rate': '4.979e-05', 'epoch': '0.3002', 'num_input_tokens_seen': 24404334, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5252', 'grad_norm': '1.253', 'learning_rate': '4.979e-05', 'epoch': '0.3002', 'num_input_tokens_seen': 24406381, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2237', 'grad_norm': '0.8223', 'learning_rate': '4.979e-05', 'epoch': '0.3002', 'num_input_tokens_seen': 24408428, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4747', 'grad_norm': '1.012', 'learning_rate': '4.979e-05', 'epoch': '0.3002', 'num_input_tokens_seen': 24410475, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.003', 'grad_norm': '1.181', 'learning_rate': '4.979e-05', 'epoch': '0.3003', 'num_input_tokens_seen': 24412522, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7134', 'grad_norm': '1.12', 'learning_rate': '4.979e-05', 'epoch': '0.3003', 'num_input_tokens_seen': 24414569, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.361', 'grad_norm': '1.858', 'learning_rate': '4.979e-05', 'epoch': '0.3003', 'num_input_tokens_seen': 24416616, 'train_runtime': '1.235e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6145', 'grad_norm': '1.18', 'learning_rate': '4.979e-05', 'epoch': '0.3003', 'num_input_tokens_seen': 24418663, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3991', 'grad_norm': '0.8791', 'learning_rate': '4.979e-05', 'epoch': '0.3004', 'num_input_tokens_seen': 24420710, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2974', 'grad_norm': '1.097', 'learning_rate': '4.979e-05', 'epoch': '0.3004', 'num_input_tokens_seen': 24422757, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6439', 'grad_norm': '1.087', 'learning_rate': '4.979e-05', 'epoch': '0.3004', 'num_input_tokens_seen': 24424804, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5867', 'grad_norm': '1.385', 'learning_rate': '4.979e-05', 'epoch': '0.3005', 'num_input_tokens_seen': 24426851, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4454', 'grad_norm': '1.029', 'learning_rate': '4.979e-05', 'epoch': '0.3005', 'num_input_tokens_seen': 24428898, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4249', 'grad_norm': '1.224', 'learning_rate': '4.979e-05', 'epoch': '0.3005', 'num_input_tokens_seen': 24430945, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '1.881', 'learning_rate': '4.979e-05', 'epoch': '0.3005', 'num_input_tokens_seen': 24432992, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.326', 'grad_norm': '0.8147', 'learning_rate': '4.979e-05', 'epoch': '0.3006', 'num_input_tokens_seen': 24435039, 'train_runtime': '1.236e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7182', 'grad_norm': '1.348', 'learning_rate': '4.979e-05', 'epoch': '0.3006', 'num_input_tokens_seen': 24437086, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.186', 'grad_norm': '1.959', 'learning_rate': '4.979e-05', 'epoch': '0.3006', 'num_input_tokens_seen': 24439133, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5865', 'grad_norm': '1.37', 'learning_rate': '4.979e-05', 'epoch': '0.3006', 'num_input_tokens_seen': 24441180, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7279', 'grad_norm': '1.49', 'learning_rate': '4.979e-05', 'epoch': '0.3007', 'num_input_tokens_seen': 24443227, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5859', 'grad_norm': '1.378', 'learning_rate': '4.979e-05', 'epoch': '0.3007', 'num_input_tokens_seen': 24445274, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8453', 'grad_norm': '1.444', 'learning_rate': '4.979e-05', 'epoch': '0.3007', 'num_input_tokens_seen': 24447321, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2188', 'grad_norm': '0.834', 'learning_rate': '4.979e-05', 'epoch': '0.3007', 'num_input_tokens_seen': 24449368, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.695', 'grad_norm': '3.125', 'learning_rate': '4.979e-05', 'epoch': '0.3008', 'num_input_tokens_seen': 24451415, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6802', 'grad_norm': '1.317', 'learning_rate': '4.979e-05', 'epoch': '0.3008', 'num_input_tokens_seen': 24453462, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3097', 'grad_norm': '0.9044', 'learning_rate': '4.979e-05', 'epoch': '0.3008', 'num_input_tokens_seen': 24455509, 'train_runtime': '1.237e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.857', 'grad_norm': '1.644', 'learning_rate': '4.979e-05', 'epoch': '0.3008', 'num_input_tokens_seen': 24457556, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8796', 'grad_norm': '1.422', 'learning_rate': '4.979e-05', 'epoch': '0.3009', 'num_input_tokens_seen': 24459603, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9679', 'grad_norm': '1.44', 'learning_rate': '4.979e-05', 'epoch': '0.3009', 'num_input_tokens_seen': 24461650, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2539', 'grad_norm': '0.8265', 'learning_rate': '4.979e-05', 'epoch': '0.3009', 'num_input_tokens_seen': 24463697, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6093', 'grad_norm': '1.136', 'learning_rate': '4.979e-05', 'epoch': '0.3009', 'num_input_tokens_seen': 24465744, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2896', 'grad_norm': '1.006', 'learning_rate': '4.979e-05', 'epoch': '0.301', 'num_input_tokens_seen': 24467791, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4093', 'grad_norm': '0.8918', 'learning_rate': '4.979e-05', 'epoch': '0.301', 'num_input_tokens_seen': 24469838, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.146', 'grad_norm': '2.232', 'learning_rate': '4.979e-05', 'epoch': '0.301', 'num_input_tokens_seen': 24471885, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.272', 'grad_norm': '2.03', 'learning_rate': '4.979e-05', 'epoch': '0.301', 'num_input_tokens_seen': 24473932, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2656', 'grad_norm': '0.9283', 'learning_rate': '4.979e-05', 'epoch': '0.3011', 'num_input_tokens_seen': 24475979, 'train_runtime': '1.238e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.37', 'grad_norm': '1.753', 'learning_rate': '4.979e-05', 'epoch': '0.3011', 'num_input_tokens_seen': 24478026, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8204', 'grad_norm': '1.248', 'learning_rate': '4.979e-05', 'epoch': '0.3011', 'num_input_tokens_seen': 24480073, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7823', 'grad_norm': '1.822', 'learning_rate': '4.979e-05', 'epoch': '0.3011', 'num_input_tokens_seen': 24482120, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6836', 'grad_norm': '1.81', 'learning_rate': '4.979e-05', 'epoch': '0.3012', 'num_input_tokens_seen': 24484167, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6487', 'grad_norm': '1.333', 'learning_rate': '4.979e-05', 'epoch': '0.3012', 'num_input_tokens_seen': 24486214, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.144', 'grad_norm': '1.856', 'learning_rate': '4.979e-05', 'epoch': '0.3012', 'num_input_tokens_seen': 24488261, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3246', 'grad_norm': '0.812', 'learning_rate': '4.979e-05', 'epoch': '0.3012', 'num_input_tokens_seen': 24490308, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5235', 'grad_norm': '1.074', 'learning_rate': '4.979e-05', 'epoch': '0.3013', 'num_input_tokens_seen': 24492355, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6232', 'grad_norm': '1.204', 'learning_rate': '4.979e-05', 'epoch': '0.3013', 'num_input_tokens_seen': 24494402, 'train_runtime': '1.239e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6951', 'grad_norm': '1.63', 'learning_rate': '4.979e-05', 'epoch': '0.3013', 'num_input_tokens_seen': 24496449, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9506', 'grad_norm': '1.527', 'learning_rate': '4.979e-05', 'epoch': '0.3013', 'num_input_tokens_seen': 24498496, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7492', 'grad_norm': '1.174', 'learning_rate': '4.979e-05', 'epoch': '0.3014', 'num_input_tokens_seen': 24500543, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.335', 'grad_norm': '2.518', 'learning_rate': '4.979e-05', 'epoch': '0.3014', 'num_input_tokens_seen': 24502590, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.132', 'grad_norm': '2.251', 'learning_rate': '4.979e-05', 'epoch': '0.3014', 'num_input_tokens_seen': 24504637, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.878', 'grad_norm': '1.42', 'learning_rate': '4.979e-05', 'epoch': '0.3014', 'num_input_tokens_seen': 24506684, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3413', 'grad_norm': '0.9013', 'learning_rate': '4.979e-05', 'epoch': '0.3015', 'num_input_tokens_seen': 24508731, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8095', 'grad_norm': '1.521', 'learning_rate': '4.979e-05', 'epoch': '0.3015', 'num_input_tokens_seen': 24510778, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3832', 'grad_norm': '0.9369', 'learning_rate': '4.979e-05', 'epoch': '0.3015', 'num_input_tokens_seen': 24512825, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.411', 'grad_norm': '1.082', 'learning_rate': '4.979e-05', 'epoch': '0.3015', 'num_input_tokens_seen': 24514872, 'train_runtime': '1.24e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3889', 'grad_norm': '0.8997', 'learning_rate': '4.979e-05', 'epoch': '0.3016', 'num_input_tokens_seen': 24516919, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3874', 'grad_norm': '1.113', 'learning_rate': '4.979e-05', 'epoch': '0.3016', 'num_input_tokens_seen': 24518966, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2321', 'grad_norm': '0.9238', 'learning_rate': '4.979e-05', 'epoch': '0.3016', 'num_input_tokens_seen': 24521013, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4754', 'grad_norm': '0.9935', 'learning_rate': '4.979e-05', 'epoch': '0.3016', 'num_input_tokens_seen': 24523060, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.264', 'grad_norm': '2.065', 'learning_rate': '4.979e-05', 'epoch': '0.3017', 'num_input_tokens_seen': 24525107, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.238', 'grad_norm': '0.8082', 'learning_rate': '4.979e-05', 'epoch': '0.3017', 'num_input_tokens_seen': 24527154, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7906', 'grad_norm': '1.332', 'learning_rate': '4.979e-05', 'epoch': '0.3017', 'num_input_tokens_seen': 24529201, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9907', 'grad_norm': '2.358', 'learning_rate': '4.979e-05', 'epoch': '0.3017', 'num_input_tokens_seen': 24531248, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8838', 'grad_norm': '1.381', 'learning_rate': '4.979e-05', 'epoch': '0.3018', 'num_input_tokens_seen': 24533295, 'train_runtime': '1.241e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9564', 'grad_norm': '2.466', 'learning_rate': '4.979e-05', 'epoch': '0.3018', 'num_input_tokens_seen': 24535342, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3136', 'grad_norm': '0.8185', 'learning_rate': '4.979e-05', 'epoch': '0.3018', 'num_input_tokens_seen': 24537389, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8592', 'grad_norm': '1.368', 'learning_rate': '4.979e-05', 'epoch': '0.3018', 'num_input_tokens_seen': 24539436, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8354', 'grad_norm': '1.243', 'learning_rate': '4.979e-05', 'epoch': '0.3019', 'num_input_tokens_seen': 24541483, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5017', 'grad_norm': '1.16', 'learning_rate': '4.979e-05', 'epoch': '0.3019', 'num_input_tokens_seen': 24543530, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2257', 'grad_norm': '0.8196', 'learning_rate': '4.979e-05', 'epoch': '0.3019', 'num_input_tokens_seen': 24545577, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3697', 'grad_norm': '1.251', 'learning_rate': '4.979e-05', 'epoch': '0.3019', 'num_input_tokens_seen': 24547624, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.479', 'grad_norm': '1.308', 'learning_rate': '4.979e-05', 'epoch': '0.302', 'num_input_tokens_seen': 24549671, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.02', 'grad_norm': '1.517', 'learning_rate': '4.979e-05', 'epoch': '0.302', 'num_input_tokens_seen': 24551718, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6575', 'grad_norm': '1.838', 'learning_rate': '4.979e-05', 'epoch': '0.302', 'num_input_tokens_seen': 24553765, 'train_runtime': '1.242e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.198', 'grad_norm': '0.8189', 'learning_rate': '4.979e-05', 'epoch': '0.302', 'num_input_tokens_seen': 24555812, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6175', 'grad_norm': '1.189', 'learning_rate': '4.979e-05', 'epoch': '0.3021', 'num_input_tokens_seen': 24557859, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.927', 'grad_norm': '1.42', 'learning_rate': '4.979e-05', 'epoch': '0.3021', 'num_input_tokens_seen': 24559906, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.507', 'grad_norm': '1.43', 'learning_rate': '4.979e-05', 'epoch': '0.3021', 'num_input_tokens_seen': 24561953, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8244', 'grad_norm': '1.279', 'learning_rate': '4.979e-05', 'epoch': '0.3021', 'num_input_tokens_seen': 24564000, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +[INFO|configuration_utils.py:665] 2026-02-05 06:04:35,024 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 06:04:35,024 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 06:04:35,539 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-12000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 06:04:35,545 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-12000/tokenizer_config.json + +{'loss': '0.7439', 'grad_norm': '1.124', 'learning_rate': '4.979e-05', 'epoch': '0.3022', 'num_input_tokens_seen': 24566047, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3558', 'grad_norm': '0.9591', 'learning_rate': '4.979e-05', 'epoch': '0.3022', 'num_input_tokens_seen': 24568094, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4007', 'grad_norm': '0.9082', 'learning_rate': '4.979e-05', 'epoch': '0.3022', 'num_input_tokens_seen': 24570141, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.018', 'grad_norm': '1.256', 'learning_rate': '4.979e-05', 'epoch': '0.3022', 'num_input_tokens_seen': 24572188, 'train_runtime': '1.243e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4725', 'grad_norm': '1.235', 'learning_rate': '4.979e-05', 'epoch': '0.3023', 'num_input_tokens_seen': 24574235, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7633', 'grad_norm': '1.412', 'learning_rate': '4.979e-05', 'epoch': '0.3023', 'num_input_tokens_seen': 24576282, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2446', 'grad_norm': '0.9949', 'learning_rate': '4.979e-05', 'epoch': '0.3023', 'num_input_tokens_seen': 24578329, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2364', 'grad_norm': '0.8317', 'learning_rate': '4.979e-05', 'epoch': '0.3023', 'num_input_tokens_seen': 24580376, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.449', 'grad_norm': '0.9472', 'learning_rate': '4.979e-05', 'epoch': '0.3024', 'num_input_tokens_seen': 24582423, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2356', 'grad_norm': '2.83', 'learning_rate': '4.979e-05', 'epoch': '0.3024', 'num_input_tokens_seen': 24584470, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2607', 'grad_norm': '0.7481', 'learning_rate': '4.979e-05', 'epoch': '0.3024', 'num_input_tokens_seen': 24586517, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '1.889', 'learning_rate': '4.979e-05', 'epoch': '0.3024', 'num_input_tokens_seen': 24588564, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7371', 'grad_norm': '1.251', 'learning_rate': '4.979e-05', 'epoch': '0.3025', 'num_input_tokens_seen': 24590611, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.609', 'grad_norm': '2.75', 'learning_rate': '4.979e-05', 'epoch': '0.3025', 'num_input_tokens_seen': 24592658, 'train_runtime': '1.244e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.69', 'grad_norm': '1.798', 'learning_rate': '4.979e-05', 'epoch': '0.3025', 'num_input_tokens_seen': 24594705, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.145', 'grad_norm': '1.393', 'learning_rate': '4.979e-05', 'epoch': '0.3025', 'num_input_tokens_seen': 24596752, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.295', 'grad_norm': '2.192', 'learning_rate': '4.979e-05', 'epoch': '0.3026', 'num_input_tokens_seen': 24598799, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2877', 'grad_norm': '0.7637', 'learning_rate': '4.979e-05', 'epoch': '0.3026', 'num_input_tokens_seen': 24600846, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5454', 'grad_norm': '1.295', 'learning_rate': '4.979e-05', 'epoch': '0.3026', 'num_input_tokens_seen': 24602893, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.472', 'grad_norm': '1.281', 'learning_rate': '4.979e-05', 'epoch': '0.3026', 'num_input_tokens_seen': 24604940, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5831', 'grad_norm': '0.9894', 'learning_rate': '4.979e-05', 'epoch': '0.3027', 'num_input_tokens_seen': 24606987, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.294', 'grad_norm': '0.9574', 'learning_rate': '4.979e-05', 'epoch': '0.3027', 'num_input_tokens_seen': 24609034, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2436', 'grad_norm': '0.8911', 'learning_rate': '4.979e-05', 'epoch': '0.3027', 'num_input_tokens_seen': 24611081, 'train_runtime': '1.245e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5838', 'grad_norm': '1.652', 'learning_rate': '4.979e-05', 'epoch': '0.3027', 'num_input_tokens_seen': 24613128, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7568', 'grad_norm': '2.996', 'learning_rate': '4.979e-05', 'epoch': '0.3028', 'num_input_tokens_seen': 24615175, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4615', 'grad_norm': '1.058', 'learning_rate': '4.979e-05', 'epoch': '0.3028', 'num_input_tokens_seen': 24617222, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9383', 'grad_norm': '1.309', 'learning_rate': '4.979e-05', 'epoch': '0.3028', 'num_input_tokens_seen': 24619269, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8563', 'grad_norm': '1.59', 'learning_rate': '4.979e-05', 'epoch': '0.3028', 'num_input_tokens_seen': 24621316, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4402', 'grad_norm': '1.057', 'learning_rate': '4.979e-05', 'epoch': '0.3029', 'num_input_tokens_seen': 24623363, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3633', 'grad_norm': '1.056', 'learning_rate': '4.979e-05', 'epoch': '0.3029', 'num_input_tokens_seen': 24625410, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6335', 'grad_norm': '1.159', 'learning_rate': '4.979e-05', 'epoch': '0.3029', 'num_input_tokens_seen': 24627457, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6895', 'grad_norm': '1.24', 'learning_rate': '4.979e-05', 'epoch': '0.3029', 'num_input_tokens_seen': 24629504, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.496', 'grad_norm': '2.405', 'learning_rate': '4.979e-05', 'epoch': '0.303', 'num_input_tokens_seen': 24631551, 'train_runtime': '1.246e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7627', 'grad_norm': '1.291', 'learning_rate': '4.979e-05', 'epoch': '0.303', 'num_input_tokens_seen': 24633598, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2969', 'grad_norm': '0.8389', 'learning_rate': '4.979e-05', 'epoch': '0.303', 'num_input_tokens_seen': 24635645, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.92', 'grad_norm': '2.424', 'learning_rate': '4.979e-05', 'epoch': '0.303', 'num_input_tokens_seen': 24637692, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9884', 'grad_norm': '1.388', 'learning_rate': '4.979e-05', 'epoch': '0.3031', 'num_input_tokens_seen': 24639739, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7731', 'grad_norm': '1.2', 'learning_rate': '4.979e-05', 'epoch': '0.3031', 'num_input_tokens_seen': 24641786, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6295', 'grad_norm': '1.134', 'learning_rate': '4.979e-05', 'epoch': '0.3031', 'num_input_tokens_seen': 24643833, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.045', 'grad_norm': '2.097', 'learning_rate': '4.979e-05', 'epoch': '0.3031', 'num_input_tokens_seen': 24645880, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.166', 'grad_norm': '1.881', 'learning_rate': '4.979e-05', 'epoch': '0.3032', 'num_input_tokens_seen': 24647927, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.193', 'grad_norm': '2.224', 'learning_rate': '4.979e-05', 'epoch': '0.3032', 'num_input_tokens_seen': 24649974, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.073', 'grad_norm': '1.502', 'learning_rate': '4.979e-05', 'epoch': '0.3032', 'num_input_tokens_seen': 24652021, 'train_runtime': '1.247e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.203', 'grad_norm': '1.848', 'learning_rate': '4.979e-05', 'epoch': '0.3032', 'num_input_tokens_seen': 24654068, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '1.952', 'learning_rate': '4.979e-05', 'epoch': '0.3033', 'num_input_tokens_seen': 24656115, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6861', 'grad_norm': '1.599', 'learning_rate': '4.979e-05', 'epoch': '0.3033', 'num_input_tokens_seen': 24658162, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3299', 'grad_norm': '0.8851', 'learning_rate': '4.979e-05', 'epoch': '0.3033', 'num_input_tokens_seen': 24660209, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.185', 'grad_norm': '2.096', 'learning_rate': '4.979e-05', 'epoch': '0.3033', 'num_input_tokens_seen': 24662256, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.117', 'grad_norm': '2.161', 'learning_rate': '4.979e-05', 'epoch': '0.3034', 'num_input_tokens_seen': 24664303, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3112', 'grad_norm': '0.8934', 'learning_rate': '4.979e-05', 'epoch': '0.3034', 'num_input_tokens_seen': 24666350, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.486', 'grad_norm': '2.549', 'learning_rate': '4.979e-05', 'epoch': '0.3034', 'num_input_tokens_seen': 24668397, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3621', 'grad_norm': '0.8273', 'learning_rate': '4.979e-05', 'epoch': '0.3034', 'num_input_tokens_seen': 24670444, 'train_runtime': '1.248e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2976', 'grad_norm': '0.817', 'learning_rate': '4.979e-05', 'epoch': '0.3035', 'num_input_tokens_seen': 24672491, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8541', 'grad_norm': '1.246', 'learning_rate': '4.979e-05', 'epoch': '0.3035', 'num_input_tokens_seen': 24674538, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7859', 'grad_norm': '1.023', 'learning_rate': '4.979e-05', 'epoch': '0.3035', 'num_input_tokens_seen': 24676585, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6568', 'grad_norm': '1.535', 'learning_rate': '4.979e-05', 'epoch': '0.3035', 'num_input_tokens_seen': 24678632, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4792', 'grad_norm': '1.143', 'learning_rate': '4.979e-05', 'epoch': '0.3036', 'num_input_tokens_seen': 24680679, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7157', 'grad_norm': '1.289', 'learning_rate': '4.979e-05', 'epoch': '0.3036', 'num_input_tokens_seen': 24682726, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8857', 'grad_norm': '1.022', 'learning_rate': '4.979e-05', 'epoch': '0.3036', 'num_input_tokens_seen': 24684773, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3667', 'grad_norm': '1.002', 'learning_rate': '4.979e-05', 'epoch': '0.3036', 'num_input_tokens_seen': 24686820, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5612', 'grad_norm': '1.002', 'learning_rate': '4.979e-05', 'epoch': '0.3037', 'num_input_tokens_seen': 24688867, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.376', 'grad_norm': '2.163', 'learning_rate': '4.979e-05', 'epoch': '0.3037', 'num_input_tokens_seen': 24690914, 'train_runtime': '1.249e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.592', 'grad_norm': '1.401', 'learning_rate': '4.979e-05', 'epoch': '0.3037', 'num_input_tokens_seen': 24692961, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6733', 'grad_norm': '1.613', 'learning_rate': '4.979e-05', 'epoch': '0.3037', 'num_input_tokens_seen': 24695008, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2213', 'grad_norm': '0.7806', 'learning_rate': '4.979e-05', 'epoch': '0.3038', 'num_input_tokens_seen': 24697055, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4732', 'grad_norm': '1.213', 'learning_rate': '4.979e-05', 'epoch': '0.3038', 'num_input_tokens_seen': 24699102, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.53', 'grad_norm': '2.429', 'learning_rate': '4.979e-05', 'epoch': '0.3038', 'num_input_tokens_seen': 24701149, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8665', 'grad_norm': '1.248', 'learning_rate': '4.979e-05', 'epoch': '0.3038', 'num_input_tokens_seen': 24703196, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.912', 'grad_norm': '1.762', 'learning_rate': '4.979e-05', 'epoch': '0.3039', 'num_input_tokens_seen': 24705243, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '2.061', 'learning_rate': '4.979e-05', 'epoch': '0.3039', 'num_input_tokens_seen': 24707290, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.193', 'grad_norm': '1.939', 'learning_rate': '4.979e-05', 'epoch': '0.3039', 'num_input_tokens_seen': 24709337, 'train_runtime': '1.25e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5285', 'grad_norm': '1.393', 'learning_rate': '4.979e-05', 'epoch': '0.304', 'num_input_tokens_seen': 24711384, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2047', 'grad_norm': '0.8573', 'learning_rate': '4.979e-05', 'epoch': '0.304', 'num_input_tokens_seen': 24713431, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.359', 'grad_norm': '1.895', 'learning_rate': '4.979e-05', 'epoch': '0.304', 'num_input_tokens_seen': 24715478, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2578', 'grad_norm': '0.9038', 'learning_rate': '4.979e-05', 'epoch': '0.304', 'num_input_tokens_seen': 24717525, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3239', 'grad_norm': '1.002', 'learning_rate': '4.979e-05', 'epoch': '0.3041', 'num_input_tokens_seen': 24719572, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2053', 'grad_norm': '0.8253', 'learning_rate': '4.979e-05', 'epoch': '0.3041', 'num_input_tokens_seen': 24721619, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.43', 'grad_norm': '0.8515', 'learning_rate': '4.979e-05', 'epoch': '0.3041', 'num_input_tokens_seen': 24723666, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1884', 'grad_norm': '0.8508', 'learning_rate': '4.979e-05', 'epoch': '0.3041', 'num_input_tokens_seen': 24725713, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.417', 'grad_norm': '0.9863', 'learning_rate': '4.979e-05', 'epoch': '0.3042', 'num_input_tokens_seen': 24727760, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2233', 'grad_norm': '0.8752', 'learning_rate': '4.979e-05', 'epoch': '0.3042', 'num_input_tokens_seen': 24729807, 'train_runtime': '1.251e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.195', 'grad_norm': '0.9556', 'learning_rate': '4.979e-05', 'epoch': '0.3042', 'num_input_tokens_seen': 24731854, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6558', 'grad_norm': '1.437', 'learning_rate': '4.979e-05', 'epoch': '0.3042', 'num_input_tokens_seen': 24733901, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.283', 'grad_norm': '2.143', 'learning_rate': '4.979e-05', 'epoch': '0.3043', 'num_input_tokens_seen': 24735948, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2017', 'grad_norm': '0.938', 'learning_rate': '4.979e-05', 'epoch': '0.3043', 'num_input_tokens_seen': 24737995, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.816', 'grad_norm': '2.622', 'learning_rate': '4.979e-05', 'epoch': '0.3043', 'num_input_tokens_seen': 24740042, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.637', 'grad_norm': '2.743', 'learning_rate': '4.979e-05', 'epoch': '0.3043', 'num_input_tokens_seen': 24742089, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7328', 'grad_norm': '1.109', 'learning_rate': '4.979e-05', 'epoch': '0.3044', 'num_input_tokens_seen': 24744136, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3128', 'grad_norm': '0.8131', 'learning_rate': '4.979e-05', 'epoch': '0.3044', 'num_input_tokens_seen': 24746183, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4506', 'grad_norm': '1.114', 'learning_rate': '4.979e-05', 'epoch': '0.3044', 'num_input_tokens_seen': 24748230, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2648', 'grad_norm': '0.8369', 'learning_rate': '4.979e-05', 'epoch': '0.3044', 'num_input_tokens_seen': 24750277, 'train_runtime': '1.252e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9383', 'grad_norm': '1.382', 'learning_rate': '4.979e-05', 'epoch': '0.3045', 'num_input_tokens_seen': 24752324, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3437', 'grad_norm': '0.9172', 'learning_rate': '4.979e-05', 'epoch': '0.3045', 'num_input_tokens_seen': 24754371, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4741', 'grad_norm': '1.469', 'learning_rate': '4.979e-05', 'epoch': '0.3045', 'num_input_tokens_seen': 24756418, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.623', 'grad_norm': '1.296', 'learning_rate': '4.979e-05', 'epoch': '0.3045', 'num_input_tokens_seen': 24758465, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.605', 'grad_norm': '2.62', 'learning_rate': '4.979e-05', 'epoch': '0.3046', 'num_input_tokens_seen': 24760512, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5656', 'grad_norm': '0.9667', 'learning_rate': '4.979e-05', 'epoch': '0.3046', 'num_input_tokens_seen': 24762559, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5465', 'grad_norm': '1.288', 'learning_rate': '4.979e-05', 'epoch': '0.3046', 'num_input_tokens_seen': 24764606, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3258', 'grad_norm': '0.7791', 'learning_rate': '4.979e-05', 'epoch': '0.3046', 'num_input_tokens_seen': 24766653, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.229', 'grad_norm': '1.546', 'learning_rate': '4.979e-05', 'epoch': '0.3047', 'num_input_tokens_seen': 24768700, 'train_runtime': '1.253e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '1.764', 'learning_rate': '4.979e-05', 'epoch': '0.3047', 'num_input_tokens_seen': 24770747, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.555', 'grad_norm': '1.094', 'learning_rate': '4.979e-05', 'epoch': '0.3047', 'num_input_tokens_seen': 24772794, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.073', 'grad_norm': '1.852', 'learning_rate': '4.979e-05', 'epoch': '0.3047', 'num_input_tokens_seen': 24774841, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3556', 'grad_norm': '1.028', 'learning_rate': '4.978e-05', 'epoch': '0.3048', 'num_input_tokens_seen': 24776888, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8283', 'grad_norm': '1.287', 'learning_rate': '4.978e-05', 'epoch': '0.3048', 'num_input_tokens_seen': 24778935, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8806', 'grad_norm': '1.302', 'learning_rate': '4.978e-05', 'epoch': '0.3048', 'num_input_tokens_seen': 24780982, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7563', 'grad_norm': '1.17', 'learning_rate': '4.978e-05', 'epoch': '0.3048', 'num_input_tokens_seen': 24783029, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3944', 'grad_norm': '1.164', 'learning_rate': '4.978e-05', 'epoch': '0.3049', 'num_input_tokens_seen': 24785076, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2866', 'grad_norm': '0.9066', 'learning_rate': '4.978e-05', 'epoch': '0.3049', 'num_input_tokens_seen': 24787123, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6089', 'grad_norm': '1.184', 'learning_rate': '4.978e-05', 'epoch': '0.3049', 'num_input_tokens_seen': 24789170, 'train_runtime': '1.254e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.131', 'grad_norm': '1.597', 'learning_rate': '4.978e-05', 'epoch': '0.3049', 'num_input_tokens_seen': 24791217, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.39', 'grad_norm': '2.295', 'learning_rate': '4.978e-05', 'epoch': '0.305', 'num_input_tokens_seen': 24793264, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.752', 'grad_norm': '1.335', 'learning_rate': '4.978e-05', 'epoch': '0.305', 'num_input_tokens_seen': 24795311, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2531', 'grad_norm': '0.8151', 'learning_rate': '4.978e-05', 'epoch': '0.305', 'num_input_tokens_seen': 24797358, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7306', 'grad_norm': '1.237', 'learning_rate': '4.978e-05', 'epoch': '0.305', 'num_input_tokens_seen': 24799405, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2705', 'grad_norm': '0.8703', 'learning_rate': '4.978e-05', 'epoch': '0.3051', 'num_input_tokens_seen': 24801452, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5275', 'grad_norm': '1.184', 'learning_rate': '4.978e-05', 'epoch': '0.3051', 'num_input_tokens_seen': 24803499, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6548', 'grad_norm': '1.347', 'learning_rate': '4.978e-05', 'epoch': '0.3051', 'num_input_tokens_seen': 24805546, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3378', 'grad_norm': '1.089', 'learning_rate': '4.978e-05', 'epoch': '0.3051', 'num_input_tokens_seen': 24807593, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6453', 'grad_norm': '1.226', 'learning_rate': '4.978e-05', 'epoch': '0.3052', 'num_input_tokens_seen': 24809640, 'train_runtime': '1.255e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5861', 'grad_norm': '1.298', 'learning_rate': '4.978e-05', 'epoch': '0.3052', 'num_input_tokens_seen': 24811687, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8351', 'grad_norm': '1.534', 'learning_rate': '4.978e-05', 'epoch': '0.3052', 'num_input_tokens_seen': 24813734, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2326', 'grad_norm': '0.7441', 'learning_rate': '4.978e-05', 'epoch': '0.3052', 'num_input_tokens_seen': 24815781, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7465', 'grad_norm': '1.024', 'learning_rate': '4.978e-05', 'epoch': '0.3053', 'num_input_tokens_seen': 24817828, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3232', 'grad_norm': '0.9859', 'learning_rate': '4.978e-05', 'epoch': '0.3053', 'num_input_tokens_seen': 24819875, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2695', 'grad_norm': '0.8888', 'learning_rate': '4.978e-05', 'epoch': '0.3053', 'num_input_tokens_seen': 24821922, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8305', 'grad_norm': '1.336', 'learning_rate': '4.978e-05', 'epoch': '0.3053', 'num_input_tokens_seen': 24823969, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7193', 'grad_norm': '1.187', 'learning_rate': '4.978e-05', 'epoch': '0.3054', 'num_input_tokens_seen': 24826016, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.472', 'grad_norm': '2.531', 'learning_rate': '4.978e-05', 'epoch': '0.3054', 'num_input_tokens_seen': 24828063, 'train_runtime': '1.256e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3206', 'grad_norm': '0.8842', 'learning_rate': '4.978e-05', 'epoch': '0.3054', 'num_input_tokens_seen': 24830110, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8763', 'grad_norm': '1.977', 'learning_rate': '4.978e-05', 'epoch': '0.3054', 'num_input_tokens_seen': 24832157, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5883', 'grad_norm': '1.281', 'learning_rate': '4.978e-05', 'epoch': '0.3055', 'num_input_tokens_seen': 24834204, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5444', 'grad_norm': '1.187', 'learning_rate': '4.978e-05', 'epoch': '0.3055', 'num_input_tokens_seen': 24836251, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8788', 'grad_norm': '1.442', 'learning_rate': '4.978e-05', 'epoch': '0.3055', 'num_input_tokens_seen': 24838298, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.001', 'grad_norm': '1.364', 'learning_rate': '4.978e-05', 'epoch': '0.3055', 'num_input_tokens_seen': 24840345, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2949', 'grad_norm': '0.8627', 'learning_rate': '4.978e-05', 'epoch': '0.3056', 'num_input_tokens_seen': 24842392, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.418', 'grad_norm': '1.86', 'learning_rate': '4.978e-05', 'epoch': '0.3056', 'num_input_tokens_seen': 24844439, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8502', 'grad_norm': '2.224', 'learning_rate': '4.978e-05', 'epoch': '0.3056', 'num_input_tokens_seen': 24846486, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2191', 'grad_norm': '0.7993', 'learning_rate': '4.978e-05', 'epoch': '0.3056', 'num_input_tokens_seen': 24848533, 'train_runtime': '1.257e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5421', 'grad_norm': '1.197', 'learning_rate': '4.978e-05', 'epoch': '0.3057', 'num_input_tokens_seen': 24850580, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5051', 'grad_norm': '1.122', 'learning_rate': '4.978e-05', 'epoch': '0.3057', 'num_input_tokens_seen': 24852627, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5902', 'grad_norm': '1.426', 'learning_rate': '4.978e-05', 'epoch': '0.3057', 'num_input_tokens_seen': 24854674, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8228', 'grad_norm': '2.438', 'learning_rate': '4.978e-05', 'epoch': '0.3057', 'num_input_tokens_seen': 24856721, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2861', 'grad_norm': '0.8157', 'learning_rate': '4.978e-05', 'epoch': '0.3058', 'num_input_tokens_seen': 24858768, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9408', 'grad_norm': '1.422', 'learning_rate': '4.978e-05', 'epoch': '0.3058', 'num_input_tokens_seen': 24860815, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6777', 'grad_norm': '1.368', 'learning_rate': '4.978e-05', 'epoch': '0.3058', 'num_input_tokens_seen': 24862862, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.041', 'grad_norm': '1.561', 'learning_rate': '4.978e-05', 'epoch': '0.3058', 'num_input_tokens_seen': 24864909, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8113', 'grad_norm': '2.035', 'learning_rate': '4.978e-05', 'epoch': '0.3059', 'num_input_tokens_seen': 24866956, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3197', 'grad_norm': '0.7596', 'learning_rate': '4.978e-05', 'epoch': '0.3059', 'num_input_tokens_seen': 24869003, 'train_runtime': '1.258e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6789', 'grad_norm': '1.629', 'learning_rate': '4.978e-05', 'epoch': '0.3059', 'num_input_tokens_seen': 24871050, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8121', 'grad_norm': '1.471', 'learning_rate': '4.978e-05', 'epoch': '0.3059', 'num_input_tokens_seen': 24873097, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6403', 'grad_norm': '1.184', 'learning_rate': '4.978e-05', 'epoch': '0.306', 'num_input_tokens_seen': 24875144, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8079', 'grad_norm': '1.789', 'learning_rate': '4.978e-05', 'epoch': '0.306', 'num_input_tokens_seen': 24877191, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8206', 'grad_norm': '1.498', 'learning_rate': '4.978e-05', 'epoch': '0.306', 'num_input_tokens_seen': 24879238, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.078', 'grad_norm': '1.849', 'learning_rate': '4.978e-05', 'epoch': '0.306', 'num_input_tokens_seen': 24881285, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3984', 'grad_norm': '1.207', 'learning_rate': '4.978e-05', 'epoch': '0.3061', 'num_input_tokens_seen': 24883332, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8765', 'grad_norm': '1.793', 'learning_rate': '4.978e-05', 'epoch': '0.3061', 'num_input_tokens_seen': 24885379, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3615', 'grad_norm': '0.7267', 'learning_rate': '4.978e-05', 'epoch': '0.3061', 'num_input_tokens_seen': 24887426, 'train_runtime': '1.259e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9865', 'grad_norm': '2.308', 'learning_rate': '4.978e-05', 'epoch': '0.3061', 'num_input_tokens_seen': 24889473, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.161', 'grad_norm': '1.901', 'learning_rate': '4.978e-05', 'epoch': '0.3062', 'num_input_tokens_seen': 24891520, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9304', 'grad_norm': '1.491', 'learning_rate': '4.978e-05', 'epoch': '0.3062', 'num_input_tokens_seen': 24893567, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4347', 'grad_norm': '1.121', 'learning_rate': '4.978e-05', 'epoch': '0.3062', 'num_input_tokens_seen': 24895614, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5529', 'grad_norm': '1.288', 'learning_rate': '4.978e-05', 'epoch': '0.3062', 'num_input_tokens_seen': 24897661, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.205', 'grad_norm': '1.653', 'learning_rate': '4.978e-05', 'epoch': '0.3063', 'num_input_tokens_seen': 24899708, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4182', 'grad_norm': '1.088', 'learning_rate': '4.978e-05', 'epoch': '0.3063', 'num_input_tokens_seen': 24901755, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3882', 'grad_norm': '1.02', 'learning_rate': '4.978e-05', 'epoch': '0.3063', 'num_input_tokens_seen': 24903802, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.286', 'grad_norm': '2.197', 'learning_rate': '4.978e-05', 'epoch': '0.3063', 'num_input_tokens_seen': 24905849, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.703', 'grad_norm': '1.059', 'learning_rate': '4.978e-05', 'epoch': '0.3064', 'num_input_tokens_seen': 24907896, 'train_runtime': '1.26e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.719', 'grad_norm': '3.18', 'learning_rate': '4.978e-05', 'epoch': '0.3064', 'num_input_tokens_seen': 24909943, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7469', 'grad_norm': '1.868', 'learning_rate': '4.978e-05', 'epoch': '0.3064', 'num_input_tokens_seen': 24911990, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2572', 'grad_norm': '0.7913', 'learning_rate': '4.978e-05', 'epoch': '0.3064', 'num_input_tokens_seen': 24914037, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7892', 'grad_norm': '1.376', 'learning_rate': '4.978e-05', 'epoch': '0.3065', 'num_input_tokens_seen': 24916084, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7558', 'grad_norm': '1.16', 'learning_rate': '4.978e-05', 'epoch': '0.3065', 'num_input_tokens_seen': 24918131, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5992', 'grad_norm': '1.158', 'learning_rate': '4.978e-05', 'epoch': '0.3065', 'num_input_tokens_seen': 24920178, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5924', 'grad_norm': '1.58', 'learning_rate': '4.978e-05', 'epoch': '0.3065', 'num_input_tokens_seen': 24922225, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7469', 'grad_norm': '1.574', 'learning_rate': '4.978e-05', 'epoch': '0.3066', 'num_input_tokens_seen': 24924272, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7409', 'grad_norm': '1.437', 'learning_rate': '4.978e-05', 'epoch': '0.3066', 'num_input_tokens_seen': 24926319, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5657', 'grad_norm': '1.34', 'learning_rate': '4.978e-05', 'epoch': '0.3066', 'num_input_tokens_seen': 24928366, 'train_runtime': '1.261e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4047', 'grad_norm': '1.107', 'learning_rate': '4.978e-05', 'epoch': '0.3066', 'num_input_tokens_seen': 24930413, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7613', 'grad_norm': '1.864', 'learning_rate': '4.978e-05', 'epoch': '0.3067', 'num_input_tokens_seen': 24932460, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7325', 'grad_norm': '1.31', 'learning_rate': '4.978e-05', 'epoch': '0.3067', 'num_input_tokens_seen': 24934507, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1809', 'grad_norm': '0.8693', 'learning_rate': '4.978e-05', 'epoch': '0.3067', 'num_input_tokens_seen': 24936554, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5646', 'grad_norm': '1.383', 'learning_rate': '4.978e-05', 'epoch': '0.3067', 'num_input_tokens_seen': 24938601, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6317', 'grad_norm': '1.238', 'learning_rate': '4.978e-05', 'epoch': '0.3068', 'num_input_tokens_seen': 24940648, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3697', 'grad_norm': '1.234', 'learning_rate': '4.978e-05', 'epoch': '0.3068', 'num_input_tokens_seen': 24942695, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.318', 'grad_norm': '0.8346', 'learning_rate': '4.978e-05', 'epoch': '0.3068', 'num_input_tokens_seen': 24944742, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7136', 'grad_norm': '1.004', 'learning_rate': '4.978e-05', 'epoch': '0.3068', 'num_input_tokens_seen': 24946789, 'train_runtime': '1.262e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.39', 'grad_norm': '3.056', 'learning_rate': '4.978e-05', 'epoch': '0.3069', 'num_input_tokens_seen': 24948836, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.72', 'grad_norm': '1.494', 'learning_rate': '4.978e-05', 'epoch': '0.3069', 'num_input_tokens_seen': 24950883, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.169', 'grad_norm': '1.55', 'learning_rate': '4.978e-05', 'epoch': '0.3069', 'num_input_tokens_seen': 24952930, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.282', 'grad_norm': '2.36', 'learning_rate': '4.978e-05', 'epoch': '0.3069', 'num_input_tokens_seen': 24954977, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3576', 'grad_norm': '0.9946', 'learning_rate': '4.978e-05', 'epoch': '0.307', 'num_input_tokens_seen': 24957024, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2827', 'grad_norm': '0.961', 'learning_rate': '4.978e-05', 'epoch': '0.307', 'num_input_tokens_seen': 24959071, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.308', 'grad_norm': '2.361', 'learning_rate': '4.978e-05', 'epoch': '0.307', 'num_input_tokens_seen': 24961118, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4894', 'grad_norm': '1.205', 'learning_rate': '4.978e-05', 'epoch': '0.307', 'num_input_tokens_seen': 24963165, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3084', 'grad_norm': '0.9413', 'learning_rate': '4.978e-05', 'epoch': '0.3071', 'num_input_tokens_seen': 24965212, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.395', 'grad_norm': '0.9808', 'learning_rate': '4.978e-05', 'epoch': '0.3071', 'num_input_tokens_seen': 24967259, 'train_runtime': '1.263e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5141', 'grad_norm': '1.002', 'learning_rate': '4.978e-05', 'epoch': '0.3071', 'num_input_tokens_seen': 24969306, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8881', 'grad_norm': '1.244', 'learning_rate': '4.978e-05', 'epoch': '0.3071', 'num_input_tokens_seen': 24971353, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5741', 'grad_norm': '1.113', 'learning_rate': '4.978e-05', 'epoch': '0.3072', 'num_input_tokens_seen': 24973400, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7211', 'grad_norm': '1.208', 'learning_rate': '4.978e-05', 'epoch': '0.3072', 'num_input_tokens_seen': 24975447, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9398', 'grad_norm': '2.023', 'learning_rate': '4.978e-05', 'epoch': '0.3072', 'num_input_tokens_seen': 24977494, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8763', 'grad_norm': '1.215', 'learning_rate': '4.978e-05', 'epoch': '0.3072', 'num_input_tokens_seen': 24979541, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4287', 'grad_norm': '0.9227', 'learning_rate': '4.978e-05', 'epoch': '0.3073', 'num_input_tokens_seen': 24981588, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.68', 'grad_norm': '4.331', 'learning_rate': '4.978e-05', 'epoch': '0.3073', 'num_input_tokens_seen': 24983635, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6123', 'grad_norm': '0.9458', 'learning_rate': '4.978e-05', 'epoch': '0.3073', 'num_input_tokens_seen': 24985682, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4677', 'grad_norm': '1.306', 'learning_rate': '4.978e-05', 'epoch': '0.3073', 'num_input_tokens_seen': 24987729, 'train_runtime': '1.264e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8463', 'grad_norm': '1.25', 'learning_rate': '4.978e-05', 'epoch': '0.3074', 'num_input_tokens_seen': 24989776, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9235', 'grad_norm': '1.789', 'learning_rate': '4.978e-05', 'epoch': '0.3074', 'num_input_tokens_seen': 24991823, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7492', 'grad_norm': '1.427', 'learning_rate': '4.978e-05', 'epoch': '0.3074', 'num_input_tokens_seen': 24993870, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3854', 'grad_norm': '1.113', 'learning_rate': '4.978e-05', 'epoch': '0.3075', 'num_input_tokens_seen': 24995917, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8846', 'grad_norm': '1.672', 'learning_rate': '4.978e-05', 'epoch': '0.3075', 'num_input_tokens_seen': 24997964, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7828', 'grad_norm': '1.704', 'learning_rate': '4.978e-05', 'epoch': '0.3075', 'num_input_tokens_seen': 25000011, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9189', 'grad_norm': '1.843', 'learning_rate': '4.978e-05', 'epoch': '0.3075', 'num_input_tokens_seen': 25002058, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3514', 'grad_norm': '0.9302', 'learning_rate': '4.978e-05', 'epoch': '0.3076', 'num_input_tokens_seen': 25004105, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5041', 'grad_norm': '1.244', 'learning_rate': '4.978e-05', 'epoch': '0.3076', 'num_input_tokens_seen': 25006152, 'train_runtime': '1.265e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5476', 'grad_norm': '1.382', 'learning_rate': '4.978e-05', 'epoch': '0.3076', 'num_input_tokens_seen': 25008199, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4929', 'grad_norm': '1.134', 'learning_rate': '4.978e-05', 'epoch': '0.3076', 'num_input_tokens_seen': 25010246, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.336', 'grad_norm': '0.9509', 'learning_rate': '4.978e-05', 'epoch': '0.3077', 'num_input_tokens_seen': 25012293, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.492', 'grad_norm': '1.336', 'learning_rate': '4.978e-05', 'epoch': '0.3077', 'num_input_tokens_seen': 25014340, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5825', 'grad_norm': '1.187', 'learning_rate': '4.978e-05', 'epoch': '0.3077', 'num_input_tokens_seen': 25016387, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '2.117', 'learning_rate': '4.978e-05', 'epoch': '0.3077', 'num_input_tokens_seen': 25018434, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5702', 'grad_norm': '1.33', 'learning_rate': '4.978e-05', 'epoch': '0.3078', 'num_input_tokens_seen': 25020481, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3769', 'grad_norm': '1.061', 'learning_rate': '4.978e-05', 'epoch': '0.3078', 'num_input_tokens_seen': 25022528, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.673', 'grad_norm': '3.134', 'learning_rate': '4.978e-05', 'epoch': '0.3078', 'num_input_tokens_seen': 25024575, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5307', 'grad_norm': '0.9971', 'learning_rate': '4.978e-05', 'epoch': '0.3078', 'num_input_tokens_seen': 25026622, 'train_runtime': '1.266e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4301', 'grad_norm': '1.056', 'learning_rate': '4.978e-05', 'epoch': '0.3079', 'num_input_tokens_seen': 25028669, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6454', 'grad_norm': '1.912', 'learning_rate': '4.978e-05', 'epoch': '0.3079', 'num_input_tokens_seen': 25030716, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6649', 'grad_norm': '1.131', 'learning_rate': '4.978e-05', 'epoch': '0.3079', 'num_input_tokens_seen': 25032763, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3344', 'grad_norm': '1.024', 'learning_rate': '4.978e-05', 'epoch': '0.3079', 'num_input_tokens_seen': 25034810, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9196', 'grad_norm': '1.747', 'learning_rate': '4.978e-05', 'epoch': '0.308', 'num_input_tokens_seen': 25036857, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.354', 'grad_norm': '2.191', 'learning_rate': '4.978e-05', 'epoch': '0.308', 'num_input_tokens_seen': 25038904, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.006', 'grad_norm': '1.96', 'learning_rate': '4.978e-05', 'epoch': '0.308', 'num_input_tokens_seen': 25040951, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2209', 'grad_norm': '0.752', 'learning_rate': '4.978e-05', 'epoch': '0.308', 'num_input_tokens_seen': 25042998, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4939', 'grad_norm': '1.239', 'learning_rate': '4.978e-05', 'epoch': '0.3081', 'num_input_tokens_seen': 25045045, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3075', 'grad_norm': '0.8132', 'learning_rate': '4.978e-05', 'epoch': '0.3081', 'num_input_tokens_seen': 25047092, 'train_runtime': '1.267e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5628', 'grad_norm': '1.03', 'learning_rate': '4.978e-05', 'epoch': '0.3081', 'num_input_tokens_seen': 25049139, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.168', 'grad_norm': '1.376', 'learning_rate': '4.978e-05', 'epoch': '0.3081', 'num_input_tokens_seen': 25051186, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3852', 'grad_norm': '0.7972', 'learning_rate': '4.978e-05', 'epoch': '0.3082', 'num_input_tokens_seen': 25053233, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.697', 'grad_norm': '2.021', 'learning_rate': '4.978e-05', 'epoch': '0.3082', 'num_input_tokens_seen': 25055280, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6362', 'grad_norm': '1.508', 'learning_rate': '4.978e-05', 'epoch': '0.3082', 'num_input_tokens_seen': 25057327, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.153', 'grad_norm': '1.478', 'learning_rate': '4.978e-05', 'epoch': '0.3082', 'num_input_tokens_seen': 25059374, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3579', 'grad_norm': '1.06', 'learning_rate': '4.978e-05', 'epoch': '0.3083', 'num_input_tokens_seen': 25061421, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5149', 'grad_norm': '1.014', 'learning_rate': '4.978e-05', 'epoch': '0.3083', 'num_input_tokens_seen': 25063468, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3012', 'grad_norm': '0.8293', 'learning_rate': '4.978e-05', 'epoch': '0.3083', 'num_input_tokens_seen': 25065515, 'train_runtime': '1.268e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2724', 'grad_norm': '0.8075', 'learning_rate': '4.978e-05', 'epoch': '0.3083', 'num_input_tokens_seen': 25067562, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4151', 'grad_norm': '0.9507', 'learning_rate': '4.978e-05', 'epoch': '0.3084', 'num_input_tokens_seen': 25069609, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2739', 'grad_norm': '0.8548', 'learning_rate': '4.978e-05', 'epoch': '0.3084', 'num_input_tokens_seen': 25071656, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.29', 'grad_norm': '2.085', 'learning_rate': '4.978e-05', 'epoch': '0.3084', 'num_input_tokens_seen': 25073703, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3421', 'grad_norm': '1.04', 'learning_rate': '4.978e-05', 'epoch': '0.3084', 'num_input_tokens_seen': 25075750, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3616', 'grad_norm': '0.81', 'learning_rate': '4.978e-05', 'epoch': '0.3085', 'num_input_tokens_seen': 25077797, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.165', 'grad_norm': '2.259', 'learning_rate': '4.978e-05', 'epoch': '0.3085', 'num_input_tokens_seen': 25079844, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.276', 'grad_norm': '2.7', 'learning_rate': '4.978e-05', 'epoch': '0.3085', 'num_input_tokens_seen': 25081891, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2858', 'grad_norm': '0.8817', 'learning_rate': '4.978e-05', 'epoch': '0.3085', 'num_input_tokens_seen': 25083938, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5013', 'grad_norm': '1.272', 'learning_rate': '4.978e-05', 'epoch': '0.3086', 'num_input_tokens_seen': 25085985, 'train_runtime': '1.269e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5184', 'grad_norm': '1.189', 'learning_rate': '4.978e-05', 'epoch': '0.3086', 'num_input_tokens_seen': 25088032, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6859', 'grad_norm': '1.118', 'learning_rate': '4.978e-05', 'epoch': '0.3086', 'num_input_tokens_seen': 25090079, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2299', 'grad_norm': '0.8767', 'learning_rate': '4.978e-05', 'epoch': '0.3086', 'num_input_tokens_seen': 25092126, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1996', 'grad_norm': '0.8002', 'learning_rate': '4.978e-05', 'epoch': '0.3087', 'num_input_tokens_seen': 25094173, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7555', 'grad_norm': '1.271', 'learning_rate': '4.978e-05', 'epoch': '0.3087', 'num_input_tokens_seen': 25096220, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.74', 'grad_norm': '2.693', 'learning_rate': '4.978e-05', 'epoch': '0.3087', 'num_input_tokens_seen': 25098267, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2611', 'grad_norm': '0.8315', 'learning_rate': '4.978e-05', 'epoch': '0.3087', 'num_input_tokens_seen': 25100314, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4033', 'grad_norm': '1.002', 'learning_rate': '4.978e-05', 'epoch': '0.3088', 'num_input_tokens_seen': 25102361, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2692', 'grad_norm': '0.863', 'learning_rate': '4.978e-05', 'epoch': '0.3088', 'num_input_tokens_seen': 25104408, 'train_runtime': '1.27e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6433', 'grad_norm': '0.9859', 'learning_rate': '4.978e-05', 'epoch': '0.3088', 'num_input_tokens_seen': 25106455, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.573', 'grad_norm': '2.357', 'learning_rate': '4.978e-05', 'epoch': '0.3088', 'num_input_tokens_seen': 25108502, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.046', 'grad_norm': '2.65', 'learning_rate': '4.978e-05', 'epoch': '0.3089', 'num_input_tokens_seen': 25110549, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.363', 'grad_norm': '0.8568', 'learning_rate': '4.978e-05', 'epoch': '0.3089', 'num_input_tokens_seen': 25112596, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4885', 'grad_norm': '1.03', 'learning_rate': '4.978e-05', 'epoch': '0.3089', 'num_input_tokens_seen': 25114643, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6438', 'grad_norm': '1.386', 'learning_rate': '4.978e-05', 'epoch': '0.3089', 'num_input_tokens_seen': 25116690, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5092', 'grad_norm': '0.97', 'learning_rate': '4.978e-05', 'epoch': '0.309', 'num_input_tokens_seen': 25118737, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5522', 'grad_norm': '0.9965', 'learning_rate': '4.978e-05', 'epoch': '0.309', 'num_input_tokens_seen': 25120784, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.293', 'grad_norm': '2.231', 'learning_rate': '4.978e-05', 'epoch': '0.309', 'num_input_tokens_seen': 25122831, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3502', 'grad_norm': '1.103', 'learning_rate': '4.978e-05', 'epoch': '0.309', 'num_input_tokens_seen': 25124878, 'train_runtime': '1.271e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4325', 'grad_norm': '1.249', 'learning_rate': '4.978e-05', 'epoch': '0.3091', 'num_input_tokens_seen': 25126925, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3029', 'grad_norm': '0.9887', 'learning_rate': '4.978e-05', 'epoch': '0.3091', 'num_input_tokens_seen': 25128972, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2249', 'grad_norm': '0.928', 'learning_rate': '4.978e-05', 'epoch': '0.3091', 'num_input_tokens_seen': 25131019, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3367', 'grad_norm': '0.8266', 'learning_rate': '4.978e-05', 'epoch': '0.3091', 'num_input_tokens_seen': 25133066, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9032', 'grad_norm': '1.314', 'learning_rate': '4.978e-05', 'epoch': '0.3092', 'num_input_tokens_seen': 25135113, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.574', 'grad_norm': '2.283', 'learning_rate': '4.978e-05', 'epoch': '0.3092', 'num_input_tokens_seen': 25137160, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9995', 'grad_norm': '1.799', 'learning_rate': '4.978e-05', 'epoch': '0.3092', 'num_input_tokens_seen': 25139207, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4968', 'grad_norm': '1.273', 'learning_rate': '4.978e-05', 'epoch': '0.3092', 'num_input_tokens_seen': 25141254, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.501', 'grad_norm': '1.185', 'learning_rate': '4.978e-05', 'epoch': '0.3093', 'num_input_tokens_seen': 25143301, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.538', 'grad_norm': '2.19', 'learning_rate': '4.978e-05', 'epoch': '0.3093', 'num_input_tokens_seen': 25145348, 'train_runtime': '1.272e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6269', 'grad_norm': '1.228', 'learning_rate': '4.978e-05', 'epoch': '0.3093', 'num_input_tokens_seen': 25147395, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4623', 'grad_norm': '1.162', 'learning_rate': '4.978e-05', 'epoch': '0.3093', 'num_input_tokens_seen': 25149442, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7551', 'grad_norm': '1.23', 'learning_rate': '4.978e-05', 'epoch': '0.3094', 'num_input_tokens_seen': 25151489, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.254', 'grad_norm': '2.424', 'learning_rate': '4.978e-05', 'epoch': '0.3094', 'num_input_tokens_seen': 25153536, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4185', 'grad_norm': '1.285', 'learning_rate': '4.978e-05', 'epoch': '0.3094', 'num_input_tokens_seen': 25155583, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8043', 'grad_norm': '1.266', 'learning_rate': '4.978e-05', 'epoch': '0.3094', 'num_input_tokens_seen': 25157630, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4643', 'grad_norm': '1.227', 'learning_rate': '4.977e-05', 'epoch': '0.3095', 'num_input_tokens_seen': 25159677, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.58', 'grad_norm': '2.69', 'learning_rate': '4.977e-05', 'epoch': '0.3095', 'num_input_tokens_seen': 25161724, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8828', 'grad_norm': '1.924', 'learning_rate': '4.977e-05', 'epoch': '0.3095', 'num_input_tokens_seen': 25163771, 'train_runtime': '1.273e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2404', 'grad_norm': '0.877', 'learning_rate': '4.977e-05', 'epoch': '0.3095', 'num_input_tokens_seen': 25165818, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9283', 'grad_norm': '1.382', 'learning_rate': '4.977e-05', 'epoch': '0.3096', 'num_input_tokens_seen': 25167865, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5668', 'grad_norm': '1.137', 'learning_rate': '4.977e-05', 'epoch': '0.3096', 'num_input_tokens_seen': 25169912, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3001', 'grad_norm': '0.9064', 'learning_rate': '4.977e-05', 'epoch': '0.3096', 'num_input_tokens_seen': 25171959, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.33', 'grad_norm': '1', 'learning_rate': '4.977e-05', 'epoch': '0.3096', 'num_input_tokens_seen': 25174006, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5426', 'grad_norm': '0.7922', 'learning_rate': '4.977e-05', 'epoch': '0.3097', 'num_input_tokens_seen': 25176053, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.654', 'grad_norm': '2.946', 'learning_rate': '4.977e-05', 'epoch': '0.3097', 'num_input_tokens_seen': 25178100, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3162', 'grad_norm': '0.789', 'learning_rate': '4.977e-05', 'epoch': '0.3097', 'num_input_tokens_seen': 25180147, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5571', 'grad_norm': '1.315', 'learning_rate': '4.977e-05', 'epoch': '0.3097', 'num_input_tokens_seen': 25182194, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3479', 'grad_norm': '1.136', 'learning_rate': '4.977e-05', 'epoch': '0.3098', 'num_input_tokens_seen': 25184241, 'train_runtime': '1.274e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6475', 'grad_norm': '1.581', 'learning_rate': '4.977e-05', 'epoch': '0.3098', 'num_input_tokens_seen': 25186288, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.246', 'grad_norm': '1.881', 'learning_rate': '4.977e-05', 'epoch': '0.3098', 'num_input_tokens_seen': 25188335, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.491', 'grad_norm': '1.27', 'learning_rate': '4.977e-05', 'epoch': '0.3098', 'num_input_tokens_seen': 25190382, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.119', 'grad_norm': '1.794', 'learning_rate': '4.977e-05', 'epoch': '0.3099', 'num_input_tokens_seen': 25192429, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2611', 'grad_norm': '0.8304', 'learning_rate': '4.977e-05', 'epoch': '0.3099', 'num_input_tokens_seen': 25194476, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.593', 'grad_norm': '2.371', 'learning_rate': '4.977e-05', 'epoch': '0.3099', 'num_input_tokens_seen': 25196523, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8424', 'grad_norm': '1.62', 'learning_rate': '4.977e-05', 'epoch': '0.3099', 'num_input_tokens_seen': 25198570, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8204', 'grad_norm': '2.012', 'learning_rate': '4.977e-05', 'epoch': '0.31', 'num_input_tokens_seen': 25200617, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3012', 'grad_norm': '0.9093', 'learning_rate': '4.977e-05', 'epoch': '0.31', 'num_input_tokens_seen': 25202664, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6884', 'grad_norm': '1.619', 'learning_rate': '4.977e-05', 'epoch': '0.31', 'num_input_tokens_seen': 25204711, 'train_runtime': '1.275e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2725', 'grad_norm': '0.7879', 'learning_rate': '4.977e-05', 'epoch': '0.31', 'num_input_tokens_seen': 25206758, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.809', 'grad_norm': '2.826', 'learning_rate': '4.977e-05', 'epoch': '0.3101', 'num_input_tokens_seen': 25208805, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6805', 'grad_norm': '1.083', 'learning_rate': '4.977e-05', 'epoch': '0.3101', 'num_input_tokens_seen': 25210852, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9779', 'grad_norm': '2.036', 'learning_rate': '4.977e-05', 'epoch': '0.3101', 'num_input_tokens_seen': 25212899, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4639', 'grad_norm': '1.188', 'learning_rate': '4.977e-05', 'epoch': '0.3101', 'num_input_tokens_seen': 25214946, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8114', 'grad_norm': '1.704', 'learning_rate': '4.977e-05', 'epoch': '0.3102', 'num_input_tokens_seen': 25216993, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.358', 'grad_norm': '2.098', 'learning_rate': '4.977e-05', 'epoch': '0.3102', 'num_input_tokens_seen': 25219040, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6755', 'grad_norm': '1.582', 'learning_rate': '4.977e-05', 'epoch': '0.3102', 'num_input_tokens_seen': 25221087, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.706', 'grad_norm': '2.66', 'learning_rate': '4.977e-05', 'epoch': '0.3102', 'num_input_tokens_seen': 25223134, 'train_runtime': '1.276e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4727', 'grad_norm': '1.064', 'learning_rate': '4.977e-05', 'epoch': '0.3103', 'num_input_tokens_seen': 25225181, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9626', 'grad_norm': '1.422', 'learning_rate': '4.977e-05', 'epoch': '0.3103', 'num_input_tokens_seen': 25227228, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.611', 'grad_norm': '3.169', 'learning_rate': '4.977e-05', 'epoch': '0.3103', 'num_input_tokens_seen': 25229275, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.609', 'grad_norm': '2.523', 'learning_rate': '4.977e-05', 'epoch': '0.3103', 'num_input_tokens_seen': 25231322, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4775', 'grad_norm': '0.9039', 'learning_rate': '4.977e-05', 'epoch': '0.3104', 'num_input_tokens_seen': 25233369, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.729', 'grad_norm': '2.537', 'learning_rate': '4.977e-05', 'epoch': '0.3104', 'num_input_tokens_seen': 25235416, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6398', 'grad_norm': '1.555', 'learning_rate': '4.977e-05', 'epoch': '0.3104', 'num_input_tokens_seen': 25237463, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6727', 'grad_norm': '1.017', 'learning_rate': '4.977e-05', 'epoch': '0.3104', 'num_input_tokens_seen': 25239510, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2353', 'grad_norm': '0.8249', 'learning_rate': '4.977e-05', 'epoch': '0.3105', 'num_input_tokens_seen': 25241557, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.82', 'grad_norm': '1.549', 'learning_rate': '4.977e-05', 'epoch': '0.3105', 'num_input_tokens_seen': 25243604, 'train_runtime': '1.277e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4838', 'grad_norm': '1.143', 'learning_rate': '4.977e-05', 'epoch': '0.3105', 'num_input_tokens_seen': 25245651, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.548', 'grad_norm': '2.697', 'learning_rate': '4.977e-05', 'epoch': '0.3105', 'num_input_tokens_seen': 25247698, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7397', 'grad_norm': '1.277', 'learning_rate': '4.977e-05', 'epoch': '0.3106', 'num_input_tokens_seen': 25249745, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3148', 'grad_norm': '1.161', 'learning_rate': '4.977e-05', 'epoch': '0.3106', 'num_input_tokens_seen': 25251792, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7914', 'grad_norm': '1.3', 'learning_rate': '4.977e-05', 'epoch': '0.3106', 'num_input_tokens_seen': 25253839, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.432', 'grad_norm': '1.002', 'learning_rate': '4.977e-05', 'epoch': '0.3106', 'num_input_tokens_seen': 25255886, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9795', 'grad_norm': '2.658', 'learning_rate': '4.977e-05', 'epoch': '0.3107', 'num_input_tokens_seen': 25257933, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1679', 'grad_norm': '0.8322', 'learning_rate': '4.977e-05', 'epoch': '0.3107', 'num_input_tokens_seen': 25259980, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.639', 'grad_norm': '1.137', 'learning_rate': '4.977e-05', 'epoch': '0.3107', 'num_input_tokens_seen': 25262027, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9575', 'grad_norm': '1.614', 'learning_rate': '4.977e-05', 'epoch': '0.3107', 'num_input_tokens_seen': 25264074, 'train_runtime': '1.278e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2794', 'grad_norm': '0.8986', 'learning_rate': '4.977e-05', 'epoch': '0.3108', 'num_input_tokens_seen': 25266121, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2845', 'grad_norm': '0.7755', 'learning_rate': '4.977e-05', 'epoch': '0.3108', 'num_input_tokens_seen': 25268168, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.974', 'grad_norm': '1.867', 'learning_rate': '4.977e-05', 'epoch': '0.3108', 'num_input_tokens_seen': 25270215, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7743', 'grad_norm': '0.9601', 'learning_rate': '4.977e-05', 'epoch': '0.3108', 'num_input_tokens_seen': 25272262, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3415', 'grad_norm': '1.079', 'learning_rate': '4.977e-05', 'epoch': '0.3109', 'num_input_tokens_seen': 25274309, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9199', 'grad_norm': '1.846', 'learning_rate': '4.977e-05', 'epoch': '0.3109', 'num_input_tokens_seen': 25276356, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.025', 'grad_norm': '2.37', 'learning_rate': '4.977e-05', 'epoch': '0.3109', 'num_input_tokens_seen': 25278403, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.451', 'grad_norm': '1.125', 'learning_rate': '4.977e-05', 'epoch': '0.3109', 'num_input_tokens_seen': 25280450, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2687', 'grad_norm': '1.04', 'learning_rate': '4.977e-05', 'epoch': '0.311', 'num_input_tokens_seen': 25282497, 'train_runtime': '1.279e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2537', 'grad_norm': '0.8337', 'learning_rate': '4.977e-05', 'epoch': '0.311', 'num_input_tokens_seen': 25284544, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4287', 'grad_norm': '1.017', 'learning_rate': '4.977e-05', 'epoch': '0.311', 'num_input_tokens_seen': 25286591, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3644', 'grad_norm': '0.8531', 'learning_rate': '4.977e-05', 'epoch': '0.3111', 'num_input_tokens_seen': 25288638, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6668', 'grad_norm': '1.22', 'learning_rate': '4.977e-05', 'epoch': '0.3111', 'num_input_tokens_seen': 25290685, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6298', 'grad_norm': '1.293', 'learning_rate': '4.977e-05', 'epoch': '0.3111', 'num_input_tokens_seen': 25292732, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9849', 'grad_norm': '2.38', 'learning_rate': '4.977e-05', 'epoch': '0.3111', 'num_input_tokens_seen': 25294779, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.126', 'grad_norm': '1.903', 'learning_rate': '4.977e-05', 'epoch': '0.3112', 'num_input_tokens_seen': 25296826, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4297', 'grad_norm': '1.099', 'learning_rate': '4.977e-05', 'epoch': '0.3112', 'num_input_tokens_seen': 25298873, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6686', 'grad_norm': '0.9736', 'learning_rate': '4.977e-05', 'epoch': '0.3112', 'num_input_tokens_seen': 25300920, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5341', 'grad_norm': '1.228', 'learning_rate': '4.977e-05', 'epoch': '0.3112', 'num_input_tokens_seen': 25302967, 'train_runtime': '1.28e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.62', 'grad_norm': '2.734', 'learning_rate': '4.977e-05', 'epoch': '0.3113', 'num_input_tokens_seen': 25305014, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5696', 'grad_norm': '1.406', 'learning_rate': '4.977e-05', 'epoch': '0.3113', 'num_input_tokens_seen': 25307061, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8246', 'grad_norm': '1.549', 'learning_rate': '4.977e-05', 'epoch': '0.3113', 'num_input_tokens_seen': 25309108, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7369', 'grad_norm': '1.151', 'learning_rate': '4.977e-05', 'epoch': '0.3113', 'num_input_tokens_seen': 25311155, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.121', 'grad_norm': '1.92', 'learning_rate': '4.977e-05', 'epoch': '0.3114', 'num_input_tokens_seen': 25313202, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.531', 'grad_norm': '1.285', 'learning_rate': '4.977e-05', 'epoch': '0.3114', 'num_input_tokens_seen': 25315249, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.615', 'grad_norm': '2.226', 'learning_rate': '4.977e-05', 'epoch': '0.3114', 'num_input_tokens_seen': 25317296, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9056', 'grad_norm': '1.718', 'learning_rate': '4.977e-05', 'epoch': '0.3114', 'num_input_tokens_seen': 25319343, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.341', 'grad_norm': '0.9556', 'learning_rate': '4.977e-05', 'epoch': '0.3115', 'num_input_tokens_seen': 25321390, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7038', 'grad_norm': '1.145', 'learning_rate': '4.977e-05', 'epoch': '0.3115', 'num_input_tokens_seen': 25323437, 'train_runtime': '1.281e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3805', 'grad_norm': '0.8373', 'learning_rate': '4.977e-05', 'epoch': '0.3115', 'num_input_tokens_seen': 25325484, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2993', 'grad_norm': '0.8493', 'learning_rate': '4.977e-05', 'epoch': '0.3115', 'num_input_tokens_seen': 25327531, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3564', 'grad_norm': '0.8485', 'learning_rate': '4.977e-05', 'epoch': '0.3116', 'num_input_tokens_seen': 25329578, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.351', 'grad_norm': '2.262', 'learning_rate': '4.977e-05', 'epoch': '0.3116', 'num_input_tokens_seen': 25331625, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '2.1', 'learning_rate': '4.977e-05', 'epoch': '0.3116', 'num_input_tokens_seen': 25333672, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.328', 'grad_norm': '1.944', 'learning_rate': '4.977e-05', 'epoch': '0.3116', 'num_input_tokens_seen': 25335719, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7606', 'grad_norm': '0.9784', 'learning_rate': '4.977e-05', 'epoch': '0.3117', 'num_input_tokens_seen': 25337766, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9457', 'grad_norm': '1.556', 'learning_rate': '4.977e-05', 'epoch': '0.3117', 'num_input_tokens_seen': 25339813, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3777', 'grad_norm': '1.014', 'learning_rate': '4.977e-05', 'epoch': '0.3117', 'num_input_tokens_seen': 25341860, 'train_runtime': '1.282e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7661', 'grad_norm': '1.073', 'learning_rate': '4.977e-05', 'epoch': '0.3117', 'num_input_tokens_seen': 25343907, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2195', 'grad_norm': '0.7379', 'learning_rate': '4.977e-05', 'epoch': '0.3118', 'num_input_tokens_seen': 25345954, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4655', 'grad_norm': '1.012', 'learning_rate': '4.977e-05', 'epoch': '0.3118', 'num_input_tokens_seen': 25348001, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.031', 'grad_norm': '1.533', 'learning_rate': '4.977e-05', 'epoch': '0.3118', 'num_input_tokens_seen': 25350048, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.023', 'grad_norm': '1.96', 'learning_rate': '4.977e-05', 'epoch': '0.3118', 'num_input_tokens_seen': 25352095, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.214', 'grad_norm': '0.9154', 'learning_rate': '4.977e-05', 'epoch': '0.3119', 'num_input_tokens_seen': 25354142, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4375', 'grad_norm': '0.9712', 'learning_rate': '4.977e-05', 'epoch': '0.3119', 'num_input_tokens_seen': 25356189, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.52', 'grad_norm': '2.181', 'learning_rate': '4.977e-05', 'epoch': '0.3119', 'num_input_tokens_seen': 25358236, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.628', 'grad_norm': '2.028', 'learning_rate': '4.977e-05', 'epoch': '0.3119', 'num_input_tokens_seen': 25360283, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.182', 'grad_norm': '1.857', 'learning_rate': '4.977e-05', 'epoch': '0.312', 'num_input_tokens_seen': 25362330, 'train_runtime': '1.283e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.825', 'grad_norm': '1.712', 'learning_rate': '4.977e-05', 'epoch': '0.312', 'num_input_tokens_seen': 25364377, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9616', 'grad_norm': '1.886', 'learning_rate': '4.977e-05', 'epoch': '0.312', 'num_input_tokens_seen': 25366424, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9731', 'grad_norm': '1.679', 'learning_rate': '4.977e-05', 'epoch': '0.312', 'num_input_tokens_seen': 25368471, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.463', 'grad_norm': '1.078', 'learning_rate': '4.977e-05', 'epoch': '0.3121', 'num_input_tokens_seen': 25370518, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.238', 'grad_norm': '2.337', 'learning_rate': '4.977e-05', 'epoch': '0.3121', 'num_input_tokens_seen': 25372565, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4322', 'grad_norm': '1.078', 'learning_rate': '4.977e-05', 'epoch': '0.3121', 'num_input_tokens_seen': 25374612, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2837', 'grad_norm': '0.962', 'learning_rate': '4.977e-05', 'epoch': '0.3121', 'num_input_tokens_seen': 25376659, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2563', 'grad_norm': '0.9468', 'learning_rate': '4.977e-05', 'epoch': '0.3122', 'num_input_tokens_seen': 25378706, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4513', 'grad_norm': '1.124', 'learning_rate': '4.977e-05', 'epoch': '0.3122', 'num_input_tokens_seen': 25380753, 'train_runtime': '1.284e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3077', 'grad_norm': '0.7661', 'learning_rate': '4.977e-05', 'epoch': '0.3122', 'num_input_tokens_seen': 25382800, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7071', 'grad_norm': '1.041', 'learning_rate': '4.977e-05', 'epoch': '0.3122', 'num_input_tokens_seen': 25384847, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9216', 'grad_norm': '1.631', 'learning_rate': '4.977e-05', 'epoch': '0.3123', 'num_input_tokens_seen': 25386894, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3641', 'grad_norm': '0.9319', 'learning_rate': '4.977e-05', 'epoch': '0.3123', 'num_input_tokens_seen': 25388941, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3555', 'grad_norm': '0.8957', 'learning_rate': '4.977e-05', 'epoch': '0.3123', 'num_input_tokens_seen': 25390988, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8301', 'grad_norm': '1.48', 'learning_rate': '4.977e-05', 'epoch': '0.3123', 'num_input_tokens_seen': 25393035, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.29', 'grad_norm': '2.781', 'learning_rate': '4.977e-05', 'epoch': '0.3124', 'num_input_tokens_seen': 25395082, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5778', 'grad_norm': '0.9465', 'learning_rate': '4.977e-05', 'epoch': '0.3124', 'num_input_tokens_seen': 25397129, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5492', 'grad_norm': '1.358', 'learning_rate': '4.977e-05', 'epoch': '0.3124', 'num_input_tokens_seen': 25399176, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.442', 'grad_norm': '1.93', 'learning_rate': '4.977e-05', 'epoch': '0.3124', 'num_input_tokens_seen': 25401223, 'train_runtime': '1.285e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.713', 'grad_norm': '2.469', 'learning_rate': '4.977e-05', 'epoch': '0.3125', 'num_input_tokens_seen': 25403270, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7119', 'grad_norm': '1.68', 'learning_rate': '4.977e-05', 'epoch': '0.3125', 'num_input_tokens_seen': 25405317, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8203', 'grad_norm': '1.517', 'learning_rate': '4.977e-05', 'epoch': '0.3125', 'num_input_tokens_seen': 25407364, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8305', 'grad_norm': '1.405', 'learning_rate': '4.977e-05', 'epoch': '0.3125', 'num_input_tokens_seen': 25409411, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5176', 'grad_norm': '1.057', 'learning_rate': '4.977e-05', 'epoch': '0.3126', 'num_input_tokens_seen': 25411458, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.824', 'grad_norm': '1.938', 'learning_rate': '4.977e-05', 'epoch': '0.3126', 'num_input_tokens_seen': 25413505, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.658', 'grad_norm': '1.253', 'learning_rate': '4.977e-05', 'epoch': '0.3126', 'num_input_tokens_seen': 25415552, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6748', 'grad_norm': '1.208', 'learning_rate': '4.977e-05', 'epoch': '0.3126', 'num_input_tokens_seen': 25417599, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3289', 'grad_norm': '0.9933', 'learning_rate': '4.977e-05', 'epoch': '0.3127', 'num_input_tokens_seen': 25419646, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7009', 'grad_norm': '1.33', 'learning_rate': '4.977e-05', 'epoch': '0.3127', 'num_input_tokens_seen': 25421693, 'train_runtime': '1.286e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.468', 'grad_norm': '2.315', 'learning_rate': '4.977e-05', 'epoch': '0.3127', 'num_input_tokens_seen': 25423740, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2919', 'grad_norm': '0.8636', 'learning_rate': '4.977e-05', 'epoch': '0.3127', 'num_input_tokens_seen': 25425787, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5247', 'grad_norm': '1.161', 'learning_rate': '4.977e-05', 'epoch': '0.3128', 'num_input_tokens_seen': 25427834, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6098', 'grad_norm': '1.169', 'learning_rate': '4.977e-05', 'epoch': '0.3128', 'num_input_tokens_seen': 25429881, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4081', 'grad_norm': '1.499', 'learning_rate': '4.977e-05', 'epoch': '0.3128', 'num_input_tokens_seen': 25431928, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9162', 'grad_norm': '1.286', 'learning_rate': '4.977e-05', 'epoch': '0.3128', 'num_input_tokens_seen': 25433975, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9833', 'grad_norm': '1.964', 'learning_rate': '4.977e-05', 'epoch': '0.3129', 'num_input_tokens_seen': 25436022, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7863', 'grad_norm': '1.63', 'learning_rate': '4.977e-05', 'epoch': '0.3129', 'num_input_tokens_seen': 25438069, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8497', 'grad_norm': '1.525', 'learning_rate': '4.977e-05', 'epoch': '0.3129', 'num_input_tokens_seen': 25440116, 'train_runtime': '1.287e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7676', 'grad_norm': '1.354', 'learning_rate': '4.977e-05', 'epoch': '0.3129', 'num_input_tokens_seen': 25442163, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9745', 'grad_norm': '1.703', 'learning_rate': '4.977e-05', 'epoch': '0.313', 'num_input_tokens_seen': 25444210, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9472', 'grad_norm': '1.261', 'learning_rate': '4.977e-05', 'epoch': '0.313', 'num_input_tokens_seen': 25446257, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3162', 'grad_norm': '0.9077', 'learning_rate': '4.977e-05', 'epoch': '0.313', 'num_input_tokens_seen': 25448304, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6955', 'grad_norm': '1.429', 'learning_rate': '4.977e-05', 'epoch': '0.313', 'num_input_tokens_seen': 25450351, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.825', 'grad_norm': '3.065', 'learning_rate': '4.977e-05', 'epoch': '0.3131', 'num_input_tokens_seen': 25452398, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.3', 'grad_norm': '2.637', 'learning_rate': '4.977e-05', 'epoch': '0.3131', 'num_input_tokens_seen': 25454445, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5985', 'grad_norm': '1.08', 'learning_rate': '4.977e-05', 'epoch': '0.3131', 'num_input_tokens_seen': 25456492, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8236', 'grad_norm': '1.285', 'learning_rate': '4.977e-05', 'epoch': '0.3131', 'num_input_tokens_seen': 25458539, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.819', 'grad_norm': '2.976', 'learning_rate': '4.977e-05', 'epoch': '0.3132', 'num_input_tokens_seen': 25460586, 'train_runtime': '1.288e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2379', 'grad_norm': '0.7924', 'learning_rate': '4.977e-05', 'epoch': '0.3132', 'num_input_tokens_seen': 25462633, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8652', 'grad_norm': '1.433', 'learning_rate': '4.977e-05', 'epoch': '0.3132', 'num_input_tokens_seen': 25464680, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3728', 'grad_norm': '0.9224', 'learning_rate': '4.977e-05', 'epoch': '0.3132', 'num_input_tokens_seen': 25466727, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7109', 'grad_norm': '1.202', 'learning_rate': '4.977e-05', 'epoch': '0.3133', 'num_input_tokens_seen': 25468774, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.286', 'grad_norm': '2.791', 'learning_rate': '4.977e-05', 'epoch': '0.3133', 'num_input_tokens_seen': 25470821, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6008', 'grad_norm': '0.9884', 'learning_rate': '4.977e-05', 'epoch': '0.3133', 'num_input_tokens_seen': 25472868, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4431', 'grad_norm': '0.868', 'learning_rate': '4.977e-05', 'epoch': '0.3133', 'num_input_tokens_seen': 25474915, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2307', 'grad_norm': '0.678', 'learning_rate': '4.977e-05', 'epoch': '0.3134', 'num_input_tokens_seen': 25476962, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2613', 'grad_norm': '0.8389', 'learning_rate': '4.977e-05', 'epoch': '0.3134', 'num_input_tokens_seen': 25479009, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.095', 'grad_norm': '1.559', 'learning_rate': '4.977e-05', 'epoch': '0.3134', 'num_input_tokens_seen': 25481056, 'train_runtime': '1.289e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6302', 'grad_norm': '1.221', 'learning_rate': '4.977e-05', 'epoch': '0.3134', 'num_input_tokens_seen': 25483103, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3796', 'grad_norm': '0.892', 'learning_rate': '4.977e-05', 'epoch': '0.3135', 'num_input_tokens_seen': 25485150, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.487', 'grad_norm': '1.078', 'learning_rate': '4.977e-05', 'epoch': '0.3135', 'num_input_tokens_seen': 25487197, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2124', 'grad_norm': '0.9524', 'learning_rate': '4.977e-05', 'epoch': '0.3135', 'num_input_tokens_seen': 25489244, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3348', 'grad_norm': '0.9113', 'learning_rate': '4.977e-05', 'epoch': '0.3135', 'num_input_tokens_seen': 25491291, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4997', 'grad_norm': '1.363', 'learning_rate': '4.977e-05', 'epoch': '0.3136', 'num_input_tokens_seen': 25493338, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8544', 'grad_norm': '1.34', 'learning_rate': '4.977e-05', 'epoch': '0.3136', 'num_input_tokens_seen': 25495385, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5334', 'grad_norm': '1.164', 'learning_rate': '4.977e-05', 'epoch': '0.3136', 'num_input_tokens_seen': 25497432, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4216', 'grad_norm': '1.202', 'learning_rate': '4.977e-05', 'epoch': '0.3136', 'num_input_tokens_seen': 25499479, 'train_runtime': '1.29e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7339', 'grad_norm': '1.36', 'learning_rate': '4.977e-05', 'epoch': '0.3137', 'num_input_tokens_seen': 25501526, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.075', 'grad_norm': '1.714', 'learning_rate': '4.977e-05', 'epoch': '0.3137', 'num_input_tokens_seen': 25503573, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7834', 'grad_norm': '1.342', 'learning_rate': '4.977e-05', 'epoch': '0.3137', 'num_input_tokens_seen': 25505620, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1842', 'grad_norm': '0.8206', 'learning_rate': '4.977e-05', 'epoch': '0.3137', 'num_input_tokens_seen': 25507667, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2647', 'grad_norm': '0.9718', 'learning_rate': '4.977e-05', 'epoch': '0.3138', 'num_input_tokens_seen': 25509714, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7925', 'grad_norm': '1.329', 'learning_rate': '4.977e-05', 'epoch': '0.3138', 'num_input_tokens_seen': 25511761, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.639', 'grad_norm': '1.365', 'learning_rate': '4.977e-05', 'epoch': '0.3138', 'num_input_tokens_seen': 25513808, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2924', 'grad_norm': '0.776', 'learning_rate': '4.977e-05', 'epoch': '0.3138', 'num_input_tokens_seen': 25515855, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3346', 'grad_norm': '1.021', 'learning_rate': '4.977e-05', 'epoch': '0.3139', 'num_input_tokens_seen': 25517902, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3216', 'grad_norm': '0.9508', 'learning_rate': '4.977e-05', 'epoch': '0.3139', 'num_input_tokens_seen': 25519949, 'train_runtime': '1.291e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6218', 'grad_norm': '1.538', 'learning_rate': '4.977e-05', 'epoch': '0.3139', 'num_input_tokens_seen': 25521996, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.114', 'grad_norm': '2.185', 'learning_rate': '4.977e-05', 'epoch': '0.3139', 'num_input_tokens_seen': 25524043, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.381', 'grad_norm': '2.279', 'learning_rate': '4.977e-05', 'epoch': '0.314', 'num_input_tokens_seen': 25526090, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2884', 'grad_norm': '1.18', 'learning_rate': '4.977e-05', 'epoch': '0.314', 'num_input_tokens_seen': 25528137, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3183', 'grad_norm': '1.004', 'learning_rate': '4.977e-05', 'epoch': '0.314', 'num_input_tokens_seen': 25530184, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.321', 'grad_norm': '2.461', 'learning_rate': '4.977e-05', 'epoch': '0.314', 'num_input_tokens_seen': 25532231, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.673', 'grad_norm': '1.005', 'learning_rate': '4.976e-05', 'epoch': '0.3141', 'num_input_tokens_seen': 25534278, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2207', 'grad_norm': '0.8431', 'learning_rate': '4.976e-05', 'epoch': '0.3141', 'num_input_tokens_seen': 25536325, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.576', 'grad_norm': '1.965', 'learning_rate': '4.976e-05', 'epoch': '0.3141', 'num_input_tokens_seen': 25538372, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.142', 'grad_norm': '2.065', 'learning_rate': '4.976e-05', 'epoch': '0.3141', 'num_input_tokens_seen': 25540419, 'train_runtime': '1.292e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3608', 'grad_norm': '0.8469', 'learning_rate': '4.976e-05', 'epoch': '0.3142', 'num_input_tokens_seen': 25542466, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5202', 'grad_norm': '1.076', 'learning_rate': '4.976e-05', 'epoch': '0.3142', 'num_input_tokens_seen': 25544513, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3501', 'grad_norm': '0.9608', 'learning_rate': '4.976e-05', 'epoch': '0.3142', 'num_input_tokens_seen': 25546560, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2301', 'grad_norm': '0.742', 'learning_rate': '4.976e-05', 'epoch': '0.3142', 'num_input_tokens_seen': 25548607, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.413', 'grad_norm': '2.432', 'learning_rate': '4.976e-05', 'epoch': '0.3143', 'num_input_tokens_seen': 25550654, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3157', 'grad_norm': '0.8726', 'learning_rate': '4.976e-05', 'epoch': '0.3143', 'num_input_tokens_seen': 25552701, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5148', 'grad_norm': '1.363', 'learning_rate': '4.976e-05', 'epoch': '0.3143', 'num_input_tokens_seen': 25554748, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5287', 'grad_norm': '1.356', 'learning_rate': '4.976e-05', 'epoch': '0.3143', 'num_input_tokens_seen': 25556795, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4588', 'grad_norm': '1.168', 'learning_rate': '4.976e-05', 'epoch': '0.3144', 'num_input_tokens_seen': 25558842, 'train_runtime': '1.293e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1932', 'grad_norm': '0.7976', 'learning_rate': '4.976e-05', 'epoch': '0.3144', 'num_input_tokens_seen': 25560889, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1895', 'grad_norm': '0.7478', 'learning_rate': '4.976e-05', 'epoch': '0.3144', 'num_input_tokens_seen': 25562936, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9669', 'grad_norm': '2.358', 'learning_rate': '4.976e-05', 'epoch': '0.3144', 'num_input_tokens_seen': 25564983, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5264', 'grad_norm': '1.146', 'learning_rate': '4.976e-05', 'epoch': '0.3145', 'num_input_tokens_seen': 25567030, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.01', 'grad_norm': '1.892', 'learning_rate': '4.976e-05', 'epoch': '0.3145', 'num_input_tokens_seen': 25569077, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3391', 'grad_norm': '1.037', 'learning_rate': '4.976e-05', 'epoch': '0.3145', 'num_input_tokens_seen': 25571124, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5049', 'grad_norm': '1.257', 'learning_rate': '4.976e-05', 'epoch': '0.3146', 'num_input_tokens_seen': 25573171, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4139', 'grad_norm': '0.9136', 'learning_rate': '4.976e-05', 'epoch': '0.3146', 'num_input_tokens_seen': 25575218, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2543', 'grad_norm': '0.7469', 'learning_rate': '4.976e-05', 'epoch': '0.3146', 'num_input_tokens_seen': 25577265, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.037', 'grad_norm': '1.506', 'learning_rate': '4.976e-05', 'epoch': '0.3146', 'num_input_tokens_seen': 25579312, 'train_runtime': '1.294e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4428', 'grad_norm': '1.009', 'learning_rate': '4.976e-05', 'epoch': '0.3147', 'num_input_tokens_seen': 25581359, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6937', 'grad_norm': '1.642', 'learning_rate': '4.976e-05', 'epoch': '0.3147', 'num_input_tokens_seen': 25583406, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9332', 'grad_norm': '1.292', 'learning_rate': '4.976e-05', 'epoch': '0.3147', 'num_input_tokens_seen': 25585453, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6121', 'grad_norm': '1.017', 'learning_rate': '4.976e-05', 'epoch': '0.3147', 'num_input_tokens_seen': 25587500, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5079', 'grad_norm': '1.181', 'learning_rate': '4.976e-05', 'epoch': '0.3148', 'num_input_tokens_seen': 25589547, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.001', 'grad_norm': '2.069', 'learning_rate': '4.976e-05', 'epoch': '0.3148', 'num_input_tokens_seen': 25591594, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5358', 'grad_norm': '1.204', 'learning_rate': '4.976e-05', 'epoch': '0.3148', 'num_input_tokens_seen': 25593641, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.254', 'grad_norm': '0.737', 'learning_rate': '4.976e-05', 'epoch': '0.3148', 'num_input_tokens_seen': 25595688, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3327', 'grad_norm': '0.836', 'learning_rate': '4.976e-05', 'epoch': '0.3149', 'num_input_tokens_seen': 25597735, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3499', 'grad_norm': '1.45', 'learning_rate': '4.976e-05', 'epoch': '0.3149', 'num_input_tokens_seen': 25599782, 'train_runtime': '1.295e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7522', 'grad_norm': '1.726', 'learning_rate': '4.976e-05', 'epoch': '0.3149', 'num_input_tokens_seen': 25601829, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2949', 'grad_norm': '0.8389', 'learning_rate': '4.976e-05', 'epoch': '0.3149', 'num_input_tokens_seen': 25603876, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6892', 'grad_norm': '1.105', 'learning_rate': '4.976e-05', 'epoch': '0.315', 'num_input_tokens_seen': 25605923, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7931', 'grad_norm': '1.864', 'learning_rate': '4.976e-05', 'epoch': '0.315', 'num_input_tokens_seen': 25607970, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4928', 'grad_norm': '1.04', 'learning_rate': '4.976e-05', 'epoch': '0.315', 'num_input_tokens_seen': 25610017, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.213', 'grad_norm': '2.022', 'learning_rate': '4.976e-05', 'epoch': '0.315', 'num_input_tokens_seen': 25612064, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6095', 'grad_norm': '1.044', 'learning_rate': '4.976e-05', 'epoch': '0.3151', 'num_input_tokens_seen': 25614111, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4184', 'grad_norm': '0.9974', 'learning_rate': '4.976e-05', 'epoch': '0.3151', 'num_input_tokens_seen': 25616158, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2908', 'grad_norm': '0.7654', 'learning_rate': '4.976e-05', 'epoch': '0.3151', 'num_input_tokens_seen': 25618205, 'train_runtime': '1.296e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.943', 'grad_norm': '1.388', 'learning_rate': '4.976e-05', 'epoch': '0.3151', 'num_input_tokens_seen': 25620252, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5222', 'grad_norm': '1.334', 'learning_rate': '4.976e-05', 'epoch': '0.3152', 'num_input_tokens_seen': 25622299, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4636', 'grad_norm': '1.149', 'learning_rate': '4.976e-05', 'epoch': '0.3152', 'num_input_tokens_seen': 25624346, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3955', 'grad_norm': '0.9493', 'learning_rate': '4.976e-05', 'epoch': '0.3152', 'num_input_tokens_seen': 25626393, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5337', 'grad_norm': '0.9599', 'learning_rate': '4.976e-05', 'epoch': '0.3152', 'num_input_tokens_seen': 25628440, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4404', 'grad_norm': '1.24', 'learning_rate': '4.976e-05', 'epoch': '0.3153', 'num_input_tokens_seen': 25630487, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5858', 'grad_norm': '1.822', 'learning_rate': '4.976e-05', 'epoch': '0.3153', 'num_input_tokens_seen': 25632534, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9959', 'grad_norm': '2.362', 'learning_rate': '4.976e-05', 'epoch': '0.3153', 'num_input_tokens_seen': 25634581, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2731', 'grad_norm': '0.9441', 'learning_rate': '4.976e-05', 'epoch': '0.3153', 'num_input_tokens_seen': 25636628, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6533', 'grad_norm': '1.236', 'learning_rate': '4.976e-05', 'epoch': '0.3154', 'num_input_tokens_seen': 25638675, 'train_runtime': '1.297e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8461', 'grad_norm': '2.041', 'learning_rate': '4.976e-05', 'epoch': '0.3154', 'num_input_tokens_seen': 25640722, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.347', 'grad_norm': '2.287', 'learning_rate': '4.976e-05', 'epoch': '0.3154', 'num_input_tokens_seen': 25642769, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6179', 'grad_norm': '1.206', 'learning_rate': '4.976e-05', 'epoch': '0.3154', 'num_input_tokens_seen': 25644816, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4356', 'grad_norm': '1.169', 'learning_rate': '4.976e-05', 'epoch': '0.3155', 'num_input_tokens_seen': 25646863, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7297', 'grad_norm': '1.196', 'learning_rate': '4.976e-05', 'epoch': '0.3155', 'num_input_tokens_seen': 25648910, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8764', 'grad_norm': '1.633', 'learning_rate': '4.976e-05', 'epoch': '0.3155', 'num_input_tokens_seen': 25650957, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6326', 'grad_norm': '1.263', 'learning_rate': '4.976e-05', 'epoch': '0.3155', 'num_input_tokens_seen': 25653004, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6191', 'grad_norm': '1.394', 'learning_rate': '4.976e-05', 'epoch': '0.3156', 'num_input_tokens_seen': 25655051, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5225', 'grad_norm': '1.088', 'learning_rate': '4.976e-05', 'epoch': '0.3156', 'num_input_tokens_seen': 25657098, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1986', 'grad_norm': '0.7529', 'learning_rate': '4.976e-05', 'epoch': '0.3156', 'num_input_tokens_seen': 25659145, 'train_runtime': '1.298e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.01', 'grad_norm': '2.204', 'learning_rate': '4.976e-05', 'epoch': '0.3156', 'num_input_tokens_seen': 25661192, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8665', 'grad_norm': '1.441', 'learning_rate': '4.976e-05', 'epoch': '0.3157', 'num_input_tokens_seen': 25663239, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.695', 'grad_norm': '2.443', 'learning_rate': '4.976e-05', 'epoch': '0.3157', 'num_input_tokens_seen': 25665286, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.401', 'grad_norm': '1.155', 'learning_rate': '4.976e-05', 'epoch': '0.3157', 'num_input_tokens_seen': 25667333, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3672', 'grad_norm': '1.014', 'learning_rate': '4.976e-05', 'epoch': '0.3157', 'num_input_tokens_seen': 25669380, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7626', 'grad_norm': '1.473', 'learning_rate': '4.976e-05', 'epoch': '0.3158', 'num_input_tokens_seen': 25671427, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9022', 'grad_norm': '1.667', 'learning_rate': '4.976e-05', 'epoch': '0.3158', 'num_input_tokens_seen': 25673474, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.203', 'grad_norm': '0.802', 'learning_rate': '4.976e-05', 'epoch': '0.3158', 'num_input_tokens_seen': 25675521, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.319', 'grad_norm': '0.9413', 'learning_rate': '4.976e-05', 'epoch': '0.3158', 'num_input_tokens_seen': 25677568, 'train_runtime': '1.299e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.453', 'grad_norm': '2.524', 'learning_rate': '4.976e-05', 'epoch': '0.3159', 'num_input_tokens_seen': 25679615, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7832', 'grad_norm': '1.975', 'learning_rate': '4.976e-05', 'epoch': '0.3159', 'num_input_tokens_seen': 25681662, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.008', 'grad_norm': '2.056', 'learning_rate': '4.976e-05', 'epoch': '0.3159', 'num_input_tokens_seen': 25683709, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.343', 'grad_norm': '0.9142', 'learning_rate': '4.976e-05', 'epoch': '0.3159', 'num_input_tokens_seen': 25685756, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4549', 'grad_norm': '1.131', 'learning_rate': '4.976e-05', 'epoch': '0.316', 'num_input_tokens_seen': 25687803, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6718', 'grad_norm': '1.552', 'learning_rate': '4.976e-05', 'epoch': '0.316', 'num_input_tokens_seen': 25689850, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7782', 'grad_norm': '1.231', 'learning_rate': '4.976e-05', 'epoch': '0.316', 'num_input_tokens_seen': 25691897, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5176', 'grad_norm': '1.022', 'learning_rate': '4.976e-05', 'epoch': '0.316', 'num_input_tokens_seen': 25693944, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8572', 'grad_norm': '1.059', 'learning_rate': '4.976e-05', 'epoch': '0.3161', 'num_input_tokens_seen': 25695991, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8162', 'grad_norm': '1.458', 'learning_rate': '4.976e-05', 'epoch': '0.3161', 'num_input_tokens_seen': 25698038, 'train_runtime': '1.3e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2034', 'grad_norm': '0.9326', 'learning_rate': '4.976e-05', 'epoch': '0.3161', 'num_input_tokens_seen': 25700085, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.206', 'grad_norm': '1.398', 'learning_rate': '4.976e-05', 'epoch': '0.3161', 'num_input_tokens_seen': 25702132, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8465', 'grad_norm': '1.415', 'learning_rate': '4.976e-05', 'epoch': '0.3162', 'num_input_tokens_seen': 25704179, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6593', 'grad_norm': '1.414', 'learning_rate': '4.976e-05', 'epoch': '0.3162', 'num_input_tokens_seen': 25706226, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.338', 'grad_norm': '2.107', 'learning_rate': '4.976e-05', 'epoch': '0.3162', 'num_input_tokens_seen': 25708273, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9792', 'grad_norm': '1.84', 'learning_rate': '4.976e-05', 'epoch': '0.3162', 'num_input_tokens_seen': 25710320, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.986', 'grad_norm': '1.688', 'learning_rate': '4.976e-05', 'epoch': '0.3163', 'num_input_tokens_seen': 25712367, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.871', 'grad_norm': '1.253', 'learning_rate': '4.976e-05', 'epoch': '0.3163', 'num_input_tokens_seen': 25714414, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.73', 'grad_norm': '2.526', 'learning_rate': '4.976e-05', 'epoch': '0.3163', 'num_input_tokens_seen': 25716461, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7526', 'grad_norm': '1.088', 'learning_rate': '4.976e-05', 'epoch': '0.3163', 'num_input_tokens_seen': 25718508, 'train_runtime': '1.301e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5201', 'grad_norm': '1.353', 'learning_rate': '4.976e-05', 'epoch': '0.3164', 'num_input_tokens_seen': 25720555, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7543', 'grad_norm': '1.423', 'learning_rate': '4.976e-05', 'epoch': '0.3164', 'num_input_tokens_seen': 25722602, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.54', 'grad_norm': '2.459', 'learning_rate': '4.976e-05', 'epoch': '0.3164', 'num_input_tokens_seen': 25724649, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.109', 'grad_norm': '2.323', 'learning_rate': '4.976e-05', 'epoch': '0.3164', 'num_input_tokens_seen': 25726696, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.06', 'grad_norm': '2.133', 'learning_rate': '4.976e-05', 'epoch': '0.3165', 'num_input_tokens_seen': 25728743, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.136', 'grad_norm': '1.79', 'learning_rate': '4.976e-05', 'epoch': '0.3165', 'num_input_tokens_seen': 25730790, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7586', 'grad_norm': '1.291', 'learning_rate': '4.976e-05', 'epoch': '0.3165', 'num_input_tokens_seen': 25732837, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3692', 'grad_norm': '0.9824', 'learning_rate': '4.976e-05', 'epoch': '0.3165', 'num_input_tokens_seen': 25734884, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2078', 'grad_norm': '0.7841', 'learning_rate': '4.976e-05', 'epoch': '0.3166', 'num_input_tokens_seen': 25736931, 'train_runtime': '1.302e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8764', 'grad_norm': '1.241', 'learning_rate': '4.976e-05', 'epoch': '0.3166', 'num_input_tokens_seen': 25738978, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4912', 'grad_norm': '1.315', 'learning_rate': '4.976e-05', 'epoch': '0.3166', 'num_input_tokens_seen': 25741025, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5372', 'grad_norm': '1.25', 'learning_rate': '4.976e-05', 'epoch': '0.3166', 'num_input_tokens_seen': 25743072, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3097', 'grad_norm': '0.7806', 'learning_rate': '4.976e-05', 'epoch': '0.3167', 'num_input_tokens_seen': 25745119, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4891', 'grad_norm': '0.9932', 'learning_rate': '4.976e-05', 'epoch': '0.3167', 'num_input_tokens_seen': 25747166, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2701', 'grad_norm': '0.84', 'learning_rate': '4.976e-05', 'epoch': '0.3167', 'num_input_tokens_seen': 25749213, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5491', 'grad_norm': '1.146', 'learning_rate': '4.976e-05', 'epoch': '0.3167', 'num_input_tokens_seen': 25751260, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2072', 'grad_norm': '0.7075', 'learning_rate': '4.976e-05', 'epoch': '0.3168', 'num_input_tokens_seen': 25753307, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8404', 'grad_norm': '1.232', 'learning_rate': '4.976e-05', 'epoch': '0.3168', 'num_input_tokens_seen': 25755354, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5966', 'grad_norm': '1.211', 'learning_rate': '4.976e-05', 'epoch': '0.3168', 'num_input_tokens_seen': 25757401, 'train_runtime': '1.303e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.494', 'grad_norm': '2.604', 'learning_rate': '4.976e-05', 'epoch': '0.3168', 'num_input_tokens_seen': 25759448, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.225', 'grad_norm': '2.188', 'learning_rate': '4.976e-05', 'epoch': '0.3169', 'num_input_tokens_seen': 25761495, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.163', 'grad_norm': '2.144', 'learning_rate': '4.976e-05', 'epoch': '0.3169', 'num_input_tokens_seen': 25763542, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.139', 'grad_norm': '1.978', 'learning_rate': '4.976e-05', 'epoch': '0.3169', 'num_input_tokens_seen': 25765589, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7719', 'grad_norm': '1.106', 'learning_rate': '4.976e-05', 'epoch': '0.3169', 'num_input_tokens_seen': 25767636, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4783', 'grad_norm': '1.209', 'learning_rate': '4.976e-05', 'epoch': '0.317', 'num_input_tokens_seen': 25769683, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7308', 'grad_norm': '1.949', 'learning_rate': '4.976e-05', 'epoch': '0.317', 'num_input_tokens_seen': 25771730, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.291', 'grad_norm': '2.007', 'learning_rate': '4.976e-05', 'epoch': '0.317', 'num_input_tokens_seen': 25773777, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '2.923', 'learning_rate': '4.976e-05', 'epoch': '0.317', 'num_input_tokens_seen': 25775824, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.336', 'grad_norm': '2.594', 'learning_rate': '4.976e-05', 'epoch': '0.3171', 'num_input_tokens_seen': 25777871, 'train_runtime': '1.304e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3362', 'grad_norm': '0.9731', 'learning_rate': '4.976e-05', 'epoch': '0.3171', 'num_input_tokens_seen': 25779918, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7764', 'grad_norm': '1.83', 'learning_rate': '4.976e-05', 'epoch': '0.3171', 'num_input_tokens_seen': 25781965, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3975', 'grad_norm': '0.9875', 'learning_rate': '4.976e-05', 'epoch': '0.3171', 'num_input_tokens_seen': 25784012, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3194', 'grad_norm': '1.209', 'learning_rate': '4.976e-05', 'epoch': '0.3172', 'num_input_tokens_seen': 25786059, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3381', 'grad_norm': '0.9895', 'learning_rate': '4.976e-05', 'epoch': '0.3172', 'num_input_tokens_seen': 25788106, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2727', 'grad_norm': '0.9487', 'learning_rate': '4.976e-05', 'epoch': '0.3172', 'num_input_tokens_seen': 25790153, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2732', 'grad_norm': '0.993', 'learning_rate': '4.976e-05', 'epoch': '0.3172', 'num_input_tokens_seen': 25792200, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6776', 'grad_norm': '1.535', 'learning_rate': '4.976e-05', 'epoch': '0.3173', 'num_input_tokens_seen': 25794247, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5843', 'grad_norm': '1.18', 'learning_rate': '4.976e-05', 'epoch': '0.3173', 'num_input_tokens_seen': 25796294, 'train_runtime': '1.305e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.261', 'grad_norm': '2.318', 'learning_rate': '4.976e-05', 'epoch': '0.3173', 'num_input_tokens_seen': 25798341, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4932', 'grad_norm': '1.246', 'learning_rate': '4.976e-05', 'epoch': '0.3173', 'num_input_tokens_seen': 25800388, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.26', 'grad_norm': '2.862', 'learning_rate': '4.976e-05', 'epoch': '0.3174', 'num_input_tokens_seen': 25802435, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6224', 'grad_norm': '1.486', 'learning_rate': '4.976e-05', 'epoch': '0.3174', 'num_input_tokens_seen': 25804482, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6165', 'grad_norm': '1.415', 'learning_rate': '4.976e-05', 'epoch': '0.3174', 'num_input_tokens_seen': 25806529, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5211', 'grad_norm': '1.282', 'learning_rate': '4.976e-05', 'epoch': '0.3174', 'num_input_tokens_seen': 25808576, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5519', 'grad_norm': '1.171', 'learning_rate': '4.976e-05', 'epoch': '0.3175', 'num_input_tokens_seen': 25810623, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.601', 'grad_norm': '2.499', 'learning_rate': '4.976e-05', 'epoch': '0.3175', 'num_input_tokens_seen': 25812670, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4567', 'grad_norm': '1.031', 'learning_rate': '4.976e-05', 'epoch': '0.3175', 'num_input_tokens_seen': 25814717, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8065', 'grad_norm': '1.766', 'learning_rate': '4.976e-05', 'epoch': '0.3175', 'num_input_tokens_seen': 25816764, 'train_runtime': '1.306e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3706', 'grad_norm': '0.9133', 'learning_rate': '4.976e-05', 'epoch': '0.3176', 'num_input_tokens_seen': 25818811, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.831', 'grad_norm': '2.85', 'learning_rate': '4.976e-05', 'epoch': '0.3176', 'num_input_tokens_seen': 25820858, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9229', 'grad_norm': '1.677', 'learning_rate': '4.976e-05', 'epoch': '0.3176', 'num_input_tokens_seen': 25822905, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7935', 'grad_norm': '1.152', 'learning_rate': '4.976e-05', 'epoch': '0.3176', 'num_input_tokens_seen': 25824952, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2947', 'grad_norm': '0.908', 'learning_rate': '4.976e-05', 'epoch': '0.3177', 'num_input_tokens_seen': 25826999, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5317', 'grad_norm': '1.155', 'learning_rate': '4.976e-05', 'epoch': '0.3177', 'num_input_tokens_seen': 25829046, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.671', 'grad_norm': '2.796', 'learning_rate': '4.976e-05', 'epoch': '0.3177', 'num_input_tokens_seen': 25831093, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.784', 'grad_norm': '1.331', 'learning_rate': '4.976e-05', 'epoch': '0.3177', 'num_input_tokens_seen': 25833140, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6311', 'grad_norm': '1.26', 'learning_rate': '4.976e-05', 'epoch': '0.3178', 'num_input_tokens_seen': 25835187, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3123', 'grad_norm': '1.374', 'learning_rate': '4.976e-05', 'epoch': '0.3178', 'num_input_tokens_seen': 25837234, 'train_runtime': '1.307e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.976', 'grad_norm': '1.447', 'learning_rate': '4.976e-05', 'epoch': '0.3178', 'num_input_tokens_seen': 25839281, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3011', 'grad_norm': '1.009', 'learning_rate': '4.976e-05', 'epoch': '0.3178', 'num_input_tokens_seen': 25841328, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4531', 'grad_norm': '1.221', 'learning_rate': '4.976e-05', 'epoch': '0.3179', 'num_input_tokens_seen': 25843375, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.02', 'grad_norm': '1.659', 'learning_rate': '4.976e-05', 'epoch': '0.3179', 'num_input_tokens_seen': 25845422, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6469', 'grad_norm': '1.217', 'learning_rate': '4.976e-05', 'epoch': '0.3179', 'num_input_tokens_seen': 25847469, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7405', 'grad_norm': '1.531', 'learning_rate': '4.976e-05', 'epoch': '0.3179', 'num_input_tokens_seen': 25849516, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6513', 'grad_norm': '1.268', 'learning_rate': '4.976e-05', 'epoch': '0.318', 'num_input_tokens_seen': 25851563, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3947', 'grad_norm': '2.221', 'learning_rate': '4.976e-05', 'epoch': '0.318', 'num_input_tokens_seen': 25853610, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.454', 'grad_norm': '2.803', 'learning_rate': '4.976e-05', 'epoch': '0.318', 'num_input_tokens_seen': 25855657, 'train_runtime': '1.308e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.87', 'grad_norm': '2.978', 'learning_rate': '4.976e-05', 'epoch': '0.3181', 'num_input_tokens_seen': 25857704, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.443', 'grad_norm': '1.089', 'learning_rate': '4.976e-05', 'epoch': '0.3181', 'num_input_tokens_seen': 25859751, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.339', 'grad_norm': '0.9119', 'learning_rate': '4.976e-05', 'epoch': '0.3181', 'num_input_tokens_seen': 25861798, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5799', 'grad_norm': '1.232', 'learning_rate': '4.976e-05', 'epoch': '0.3181', 'num_input_tokens_seen': 25863845, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3942', 'grad_norm': '1.046', 'learning_rate': '4.976e-05', 'epoch': '0.3182', 'num_input_tokens_seen': 25865892, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9371', 'grad_norm': '1.932', 'learning_rate': '4.976e-05', 'epoch': '0.3182', 'num_input_tokens_seen': 25867939, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.741', 'grad_norm': '1.196', 'learning_rate': '4.976e-05', 'epoch': '0.3182', 'num_input_tokens_seen': 25869986, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4933', 'grad_norm': '1.051', 'learning_rate': '4.976e-05', 'epoch': '0.3182', 'num_input_tokens_seen': 25872033, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7467', 'grad_norm': '1.089', 'learning_rate': '4.976e-05', 'epoch': '0.3183', 'num_input_tokens_seen': 25874080, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2508', 'grad_norm': '0.8559', 'learning_rate': '4.976e-05', 'epoch': '0.3183', 'num_input_tokens_seen': 25876127, 'train_runtime': '1.309e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5499', 'grad_norm': '1.404', 'learning_rate': '4.976e-05', 'epoch': '0.3183', 'num_input_tokens_seen': 25878174, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.223', 'grad_norm': '1.882', 'learning_rate': '4.976e-05', 'epoch': '0.3183', 'num_input_tokens_seen': 25880221, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.324', 'grad_norm': '2.264', 'learning_rate': '4.976e-05', 'epoch': '0.3184', 'num_input_tokens_seen': 25882268, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3044', 'grad_norm': '0.8624', 'learning_rate': '4.976e-05', 'epoch': '0.3184', 'num_input_tokens_seen': 25884315, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.399', 'grad_norm': '0.9215', 'learning_rate': '4.976e-05', 'epoch': '0.3184', 'num_input_tokens_seen': 25886362, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9211', 'grad_norm': '1.536', 'learning_rate': '4.976e-05', 'epoch': '0.3184', 'num_input_tokens_seen': 25888409, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5419', 'grad_norm': '1.335', 'learning_rate': '4.976e-05', 'epoch': '0.3185', 'num_input_tokens_seen': 25890456, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8455', 'grad_norm': '1.236', 'learning_rate': '4.976e-05', 'epoch': '0.3185', 'num_input_tokens_seen': 25892503, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.086', 'grad_norm': '1.862', 'learning_rate': '4.976e-05', 'epoch': '0.3185', 'num_input_tokens_seen': 25894550, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9804', 'grad_norm': '2.319', 'learning_rate': '4.976e-05', 'epoch': '0.3185', 'num_input_tokens_seen': 25896597, 'train_runtime': '1.31e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8803', 'grad_norm': '2.187', 'learning_rate': '4.976e-05', 'epoch': '0.3186', 'num_input_tokens_seen': 25898644, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6131', 'grad_norm': '1.212', 'learning_rate': '4.975e-05', 'epoch': '0.3186', 'num_input_tokens_seen': 25900691, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6874', 'grad_norm': '1.368', 'learning_rate': '4.975e-05', 'epoch': '0.3186', 'num_input_tokens_seen': 25902738, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.59', 'grad_norm': '2.666', 'learning_rate': '4.975e-05', 'epoch': '0.3186', 'num_input_tokens_seen': 25904785, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6133', 'grad_norm': '1.358', 'learning_rate': '4.975e-05', 'epoch': '0.3187', 'num_input_tokens_seen': 25906832, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1935', 'grad_norm': '0.9662', 'learning_rate': '4.975e-05', 'epoch': '0.3187', 'num_input_tokens_seen': 25908879, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5473', 'grad_norm': '1.281', 'learning_rate': '4.975e-05', 'epoch': '0.3187', 'num_input_tokens_seen': 25910926, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6766', 'grad_norm': '1.333', 'learning_rate': '4.975e-05', 'epoch': '0.3187', 'num_input_tokens_seen': 25912973, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3711', 'grad_norm': '0.9394', 'learning_rate': '4.975e-05', 'epoch': '0.3188', 'num_input_tokens_seen': 25915020, 'train_runtime': '1.311e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3244', 'grad_norm': '1.461', 'learning_rate': '4.975e-05', 'epoch': '0.3188', 'num_input_tokens_seen': 25917067, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5531', 'grad_norm': '1.301', 'learning_rate': '4.975e-05', 'epoch': '0.3188', 'num_input_tokens_seen': 25919114, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3962', 'grad_norm': '1.156', 'learning_rate': '4.975e-05', 'epoch': '0.3188', 'num_input_tokens_seen': 25921161, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6185', 'grad_norm': '1.161', 'learning_rate': '4.975e-05', 'epoch': '0.3189', 'num_input_tokens_seen': 25923208, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2138', 'grad_norm': '0.8636', 'learning_rate': '4.975e-05', 'epoch': '0.3189', 'num_input_tokens_seen': 25925255, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6149', 'grad_norm': '1.294', 'learning_rate': '4.975e-05', 'epoch': '0.3189', 'num_input_tokens_seen': 25927302, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3695', 'grad_norm': '0.9874', 'learning_rate': '4.975e-05', 'epoch': '0.3189', 'num_input_tokens_seen': 25929349, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2743', 'grad_norm': '0.9801', 'learning_rate': '4.975e-05', 'epoch': '0.319', 'num_input_tokens_seen': 25931396, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.561', 'grad_norm': '2.317', 'learning_rate': '4.975e-05', 'epoch': '0.319', 'num_input_tokens_seen': 25933443, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7478', 'grad_norm': '1.164', 'learning_rate': '4.975e-05', 'epoch': '0.319', 'num_input_tokens_seen': 25935490, 'train_runtime': '1.312e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.814', 'grad_norm': '2.35', 'learning_rate': '4.975e-05', 'epoch': '0.319', 'num_input_tokens_seen': 25937537, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.151', 'grad_norm': '2.174', 'learning_rate': '4.975e-05', 'epoch': '0.3191', 'num_input_tokens_seen': 25939584, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.02', 'grad_norm': '1.976', 'learning_rate': '4.975e-05', 'epoch': '0.3191', 'num_input_tokens_seen': 25941631, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8343', 'grad_norm': '1.613', 'learning_rate': '4.975e-05', 'epoch': '0.3191', 'num_input_tokens_seen': 25943678, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8743', 'grad_norm': '1.311', 'learning_rate': '4.975e-05', 'epoch': '0.3191', 'num_input_tokens_seen': 25945725, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9249', 'grad_norm': '2.017', 'learning_rate': '4.975e-05', 'epoch': '0.3192', 'num_input_tokens_seen': 25947772, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2836', 'grad_norm': '0.7729', 'learning_rate': '4.975e-05', 'epoch': '0.3192', 'num_input_tokens_seen': 25949819, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7545', 'grad_norm': '1.151', 'learning_rate': '4.975e-05', 'epoch': '0.3192', 'num_input_tokens_seen': 25951866, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.339', 'grad_norm': '1.113', 'learning_rate': '4.975e-05', 'epoch': '0.3192', 'num_input_tokens_seen': 25953913, 'train_runtime': '1.313e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9475', 'grad_norm': '1.417', 'learning_rate': '4.975e-05', 'epoch': '0.3193', 'num_input_tokens_seen': 25955960, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.883', 'grad_norm': '2.01', 'learning_rate': '4.975e-05', 'epoch': '0.3193', 'num_input_tokens_seen': 25958007, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2644', 'grad_norm': '0.7076', 'learning_rate': '4.975e-05', 'epoch': '0.3193', 'num_input_tokens_seen': 25960054, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5402', 'grad_norm': '1.3', 'learning_rate': '4.975e-05', 'epoch': '0.3193', 'num_input_tokens_seen': 25962101, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.504', 'grad_norm': '2.217', 'learning_rate': '4.975e-05', 'epoch': '0.3194', 'num_input_tokens_seen': 25964148, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9602', 'grad_norm': '1.732', 'learning_rate': '4.975e-05', 'epoch': '0.3194', 'num_input_tokens_seen': 25966195, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.233', 'grad_norm': '1.428', 'learning_rate': '4.975e-05', 'epoch': '0.3194', 'num_input_tokens_seen': 25968242, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3168', 'grad_norm': '0.8366', 'learning_rate': '4.975e-05', 'epoch': '0.3194', 'num_input_tokens_seen': 25970289, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2347', 'grad_norm': '0.8644', 'learning_rate': '4.975e-05', 'epoch': '0.3195', 'num_input_tokens_seen': 25972336, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3646', 'grad_norm': '1.005', 'learning_rate': '4.975e-05', 'epoch': '0.3195', 'num_input_tokens_seen': 25974383, 'train_runtime': '1.314e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2998', 'grad_norm': '0.856', 'learning_rate': '4.975e-05', 'epoch': '0.3195', 'num_input_tokens_seen': 25976430, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4387', 'grad_norm': '1.291', 'learning_rate': '4.975e-05', 'epoch': '0.3195', 'num_input_tokens_seen': 25978477, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9769', 'grad_norm': '1.319', 'learning_rate': '4.975e-05', 'epoch': '0.3196', 'num_input_tokens_seen': 25980524, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2472', 'grad_norm': '0.8004', 'learning_rate': '4.975e-05', 'epoch': '0.3196', 'num_input_tokens_seen': 25982571, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9109', 'grad_norm': '1.537', 'learning_rate': '4.975e-05', 'epoch': '0.3196', 'num_input_tokens_seen': 25984618, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2308', 'grad_norm': '0.711', 'learning_rate': '4.975e-05', 'epoch': '0.3196', 'num_input_tokens_seen': 25986665, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6665', 'grad_norm': '1.256', 'learning_rate': '4.975e-05', 'epoch': '0.3197', 'num_input_tokens_seen': 25988712, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5903', 'grad_norm': '1.2', 'learning_rate': '4.975e-05', 'epoch': '0.3197', 'num_input_tokens_seen': 25990759, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3577', 'grad_norm': '0.8036', 'learning_rate': '4.975e-05', 'epoch': '0.3197', 'num_input_tokens_seen': 25992806, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7462', 'grad_norm': '1.362', 'learning_rate': '4.975e-05', 'epoch': '0.3197', 'num_input_tokens_seen': 25994853, 'train_runtime': '1.315e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6638', 'grad_norm': '1.29', 'learning_rate': '4.975e-05', 'epoch': '0.3198', 'num_input_tokens_seen': 25996900, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2335', 'grad_norm': '0.7989', 'learning_rate': '4.975e-05', 'epoch': '0.3198', 'num_input_tokens_seen': 25998947, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.786', 'grad_norm': '2.551', 'learning_rate': '4.975e-05', 'epoch': '0.3198', 'num_input_tokens_seen': 26000994, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.824', 'grad_norm': '1.301', 'learning_rate': '4.975e-05', 'epoch': '0.3198', 'num_input_tokens_seen': 26003041, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4408', 'grad_norm': '1.052', 'learning_rate': '4.975e-05', 'epoch': '0.3199', 'num_input_tokens_seen': 26005088, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5507', 'grad_norm': '1.319', 'learning_rate': '4.975e-05', 'epoch': '0.3199', 'num_input_tokens_seen': 26007135, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4491', 'grad_norm': '1.063', 'learning_rate': '4.975e-05', 'epoch': '0.3199', 'num_input_tokens_seen': 26009182, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4812', 'grad_norm': '1.186', 'learning_rate': '4.975e-05', 'epoch': '0.3199', 'num_input_tokens_seen': 26011229, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.261', 'grad_norm': '0.8576', 'learning_rate': '4.975e-05', 'epoch': '0.32', 'num_input_tokens_seen': 26013276, 'train_runtime': '1.316e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.232', 'grad_norm': '1.88', 'learning_rate': '4.975e-05', 'epoch': '0.32', 'num_input_tokens_seen': 26015323, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.749', 'grad_norm': '2.669', 'learning_rate': '4.975e-05', 'epoch': '0.32', 'num_input_tokens_seen': 26017370, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.487', 'grad_norm': '0.8656', 'learning_rate': '4.975e-05', 'epoch': '0.32', 'num_input_tokens_seen': 26019417, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4126', 'grad_norm': '0.9825', 'learning_rate': '4.975e-05', 'epoch': '0.3201', 'num_input_tokens_seen': 26021464, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4785', 'grad_norm': '0.9716', 'learning_rate': '4.975e-05', 'epoch': '0.3201', 'num_input_tokens_seen': 26023511, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8804', 'grad_norm': '1.29', 'learning_rate': '4.975e-05', 'epoch': '0.3201', 'num_input_tokens_seen': 26025558, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.52', 'grad_norm': '1.979', 'learning_rate': '4.975e-05', 'epoch': '0.3201', 'num_input_tokens_seen': 26027605, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7296', 'grad_norm': '1.145', 'learning_rate': '4.975e-05', 'epoch': '0.3202', 'num_input_tokens_seen': 26029652, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.976', 'grad_norm': '1.559', 'learning_rate': '4.975e-05', 'epoch': '0.3202', 'num_input_tokens_seen': 26031699, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5241', 'grad_norm': '1.132', 'learning_rate': '4.975e-05', 'epoch': '0.3202', 'num_input_tokens_seen': 26033746, 'train_runtime': '1.317e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.029', 'grad_norm': '2.106', 'learning_rate': '4.975e-05', 'epoch': '0.3202', 'num_input_tokens_seen': 26035793, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2362', 'grad_norm': '0.8934', 'learning_rate': '4.975e-05', 'epoch': '0.3203', 'num_input_tokens_seen': 26037840, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.097', 'grad_norm': '2.358', 'learning_rate': '4.975e-05', 'epoch': '0.3203', 'num_input_tokens_seen': 26039887, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.359', 'grad_norm': '2.497', 'learning_rate': '4.975e-05', 'epoch': '0.3203', 'num_input_tokens_seen': 26041934, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8722', 'grad_norm': '1.525', 'learning_rate': '4.975e-05', 'epoch': '0.3203', 'num_input_tokens_seen': 26043981, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5732', 'grad_norm': '1.318', 'learning_rate': '4.975e-05', 'epoch': '0.3204', 'num_input_tokens_seen': 26046028, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6071', 'grad_norm': '1.288', 'learning_rate': '4.975e-05', 'epoch': '0.3204', 'num_input_tokens_seen': 26048075, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5721', 'grad_norm': '1.692', 'learning_rate': '4.975e-05', 'epoch': '0.3204', 'num_input_tokens_seen': 26050122, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.424', 'grad_norm': '1.122', 'learning_rate': '4.975e-05', 'epoch': '0.3204', 'num_input_tokens_seen': 26052169, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4761', 'grad_norm': '0.9928', 'learning_rate': '4.975e-05', 'epoch': '0.3205', 'num_input_tokens_seen': 26054216, 'train_runtime': '1.318e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.586', 'grad_norm': '1.091', 'learning_rate': '4.975e-05', 'epoch': '0.3205', 'num_input_tokens_seen': 26056263, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4994', 'grad_norm': '1.333', 'learning_rate': '4.975e-05', 'epoch': '0.3205', 'num_input_tokens_seen': 26058310, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3905', 'grad_norm': '1.076', 'learning_rate': '4.975e-05', 'epoch': '0.3205', 'num_input_tokens_seen': 26060357, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4893', 'grad_norm': '1.054', 'learning_rate': '4.975e-05', 'epoch': '0.3206', 'num_input_tokens_seen': 26062404, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3535', 'grad_norm': '1.164', 'learning_rate': '4.975e-05', 'epoch': '0.3206', 'num_input_tokens_seen': 26064451, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3099', 'grad_norm': '0.9257', 'learning_rate': '4.975e-05', 'epoch': '0.3206', 'num_input_tokens_seen': 26066498, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7489', 'grad_norm': '0.999', 'learning_rate': '4.975e-05', 'epoch': '0.3206', 'num_input_tokens_seen': 26068545, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.181', 'grad_norm': '2.199', 'learning_rate': '4.975e-05', 'epoch': '0.3207', 'num_input_tokens_seen': 26070592, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4736', 'grad_norm': '0.8372', 'learning_rate': '4.975e-05', 'epoch': '0.3207', 'num_input_tokens_seen': 26072639, 'train_runtime': '1.319e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9684', 'grad_norm': '1.729', 'learning_rate': '4.975e-05', 'epoch': '0.3207', 'num_input_tokens_seen': 26074686, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6519', 'grad_norm': '1.468', 'learning_rate': '4.975e-05', 'epoch': '0.3207', 'num_input_tokens_seen': 26076733, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.721', 'grad_norm': '1.132', 'learning_rate': '4.975e-05', 'epoch': '0.3208', 'num_input_tokens_seen': 26078780, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5459', 'grad_norm': '1.065', 'learning_rate': '4.975e-05', 'epoch': '0.3208', 'num_input_tokens_seen': 26080827, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6838', 'grad_norm': '1.39', 'learning_rate': '4.975e-05', 'epoch': '0.3208', 'num_input_tokens_seen': 26082874, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.164', 'grad_norm': '1.271', 'learning_rate': '4.975e-05', 'epoch': '0.3208', 'num_input_tokens_seen': 26084921, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8165', 'grad_norm': '1.946', 'learning_rate': '4.975e-05', 'epoch': '0.3209', 'num_input_tokens_seen': 26086968, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5428', 'grad_norm': '1.367', 'learning_rate': '4.975e-05', 'epoch': '0.3209', 'num_input_tokens_seen': 26089015, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.766', 'grad_norm': '4.014', 'learning_rate': '4.975e-05', 'epoch': '0.3209', 'num_input_tokens_seen': 26091062, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6059', 'grad_norm': '1.199', 'learning_rate': '4.975e-05', 'epoch': '0.3209', 'num_input_tokens_seen': 26093109, 'train_runtime': '1.32e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2704', 'grad_norm': '0.7505', 'learning_rate': '4.975e-05', 'epoch': '0.321', 'num_input_tokens_seen': 26095156, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8082', 'grad_norm': '1.28', 'learning_rate': '4.975e-05', 'epoch': '0.321', 'num_input_tokens_seen': 26097203, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3198', 'grad_norm': '0.8964', 'learning_rate': '4.975e-05', 'epoch': '0.321', 'num_input_tokens_seen': 26099250, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.384', 'grad_norm': '2.583', 'learning_rate': '4.975e-05', 'epoch': '0.321', 'num_input_tokens_seen': 26101297, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9189', 'grad_norm': '1.992', 'learning_rate': '4.975e-05', 'epoch': '0.3211', 'num_input_tokens_seen': 26103344, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5957', 'grad_norm': '0.9006', 'learning_rate': '4.975e-05', 'epoch': '0.3211', 'num_input_tokens_seen': 26105391, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.635', 'grad_norm': '1.234', 'learning_rate': '4.975e-05', 'epoch': '0.3211', 'num_input_tokens_seen': 26107438, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9095', 'grad_norm': '1.247', 'learning_rate': '4.975e-05', 'epoch': '0.3211', 'num_input_tokens_seen': 26109485, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.501', 'grad_norm': '2.218', 'learning_rate': '4.975e-05', 'epoch': '0.3212', 'num_input_tokens_seen': 26111532, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4053', 'grad_norm': '0.8418', 'learning_rate': '4.975e-05', 'epoch': '0.3212', 'num_input_tokens_seen': 26113579, 'train_runtime': '1.321e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.511', 'grad_norm': '2.048', 'learning_rate': '4.975e-05', 'epoch': '0.3212', 'num_input_tokens_seen': 26115626, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8999', 'grad_norm': '2.089', 'learning_rate': '4.975e-05', 'epoch': '0.3212', 'num_input_tokens_seen': 26117673, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6346', 'grad_norm': '1.014', 'learning_rate': '4.975e-05', 'epoch': '0.3213', 'num_input_tokens_seen': 26119720, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.289', 'grad_norm': '0.8564', 'learning_rate': '4.975e-05', 'epoch': '0.3213', 'num_input_tokens_seen': 26121767, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.489', 'grad_norm': '2.111', 'learning_rate': '4.975e-05', 'epoch': '0.3213', 'num_input_tokens_seen': 26123814, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3594', 'grad_norm': '0.839', 'learning_rate': '4.975e-05', 'epoch': '0.3213', 'num_input_tokens_seen': 26125861, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3663', 'grad_norm': '0.9371', 'learning_rate': '4.975e-05', 'epoch': '0.3214', 'num_input_tokens_seen': 26127908, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3939', 'grad_norm': '0.9148', 'learning_rate': '4.975e-05', 'epoch': '0.3214', 'num_input_tokens_seen': 26129955, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3942', 'grad_norm': '0.7884', 'learning_rate': '4.975e-05', 'epoch': '0.3214', 'num_input_tokens_seen': 26132002, 'train_runtime': '1.322e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.174', 'grad_norm': '1.974', 'learning_rate': '4.975e-05', 'epoch': '0.3214', 'num_input_tokens_seen': 26134049, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3152', 'grad_norm': '0.875', 'learning_rate': '4.975e-05', 'epoch': '0.3215', 'num_input_tokens_seen': 26136096, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4377', 'grad_norm': '0.8512', 'learning_rate': '4.975e-05', 'epoch': '0.3215', 'num_input_tokens_seen': 26138143, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3079', 'grad_norm': '0.8192', 'learning_rate': '4.975e-05', 'epoch': '0.3215', 'num_input_tokens_seen': 26140190, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5402', 'grad_norm': '1.142', 'learning_rate': '4.975e-05', 'epoch': '0.3215', 'num_input_tokens_seen': 26142237, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5742', 'grad_norm': '0.9892', 'learning_rate': '4.975e-05', 'epoch': '0.3216', 'num_input_tokens_seen': 26144284, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5264', 'grad_norm': '1.326', 'learning_rate': '4.975e-05', 'epoch': '0.3216', 'num_input_tokens_seen': 26146331, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8355', 'grad_norm': '2.057', 'learning_rate': '4.975e-05', 'epoch': '0.3216', 'num_input_tokens_seen': 26148378, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4801', 'grad_norm': '1.117', 'learning_rate': '4.975e-05', 'epoch': '0.3217', 'num_input_tokens_seen': 26150425, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.282', 'grad_norm': '2.364', 'learning_rate': '4.975e-05', 'epoch': '0.3217', 'num_input_tokens_seen': 26152472, 'train_runtime': '1.323e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9599', 'grad_norm': '1.546', 'learning_rate': '4.975e-05', 'epoch': '0.3217', 'num_input_tokens_seen': 26154519, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.45', 'grad_norm': '1.103', 'learning_rate': '4.975e-05', 'epoch': '0.3217', 'num_input_tokens_seen': 26156566, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6672', 'grad_norm': '1.772', 'learning_rate': '4.975e-05', 'epoch': '0.3218', 'num_input_tokens_seen': 26158613, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5789', 'grad_norm': '1.03', 'learning_rate': '4.975e-05', 'epoch': '0.3218', 'num_input_tokens_seen': 26160660, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.33', 'grad_norm': '6.094', 'learning_rate': '4.975e-05', 'epoch': '0.3218', 'num_input_tokens_seen': 26162707, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.513', 'grad_norm': '2.288', 'learning_rate': '4.975e-05', 'epoch': '0.3218', 'num_input_tokens_seen': 26164754, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2583', 'grad_norm': '0.9407', 'learning_rate': '4.975e-05', 'epoch': '0.3219', 'num_input_tokens_seen': 26166801, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3713', 'grad_norm': '0.9526', 'learning_rate': '4.975e-05', 'epoch': '0.3219', 'num_input_tokens_seen': 26168848, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7142', 'grad_norm': '1.554', 'learning_rate': '4.975e-05', 'epoch': '0.3219', 'num_input_tokens_seen': 26170895, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2498', 'grad_norm': '0.8519', 'learning_rate': '4.975e-05', 'epoch': '0.3219', 'num_input_tokens_seen': 26172942, 'train_runtime': '1.324e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8344', 'grad_norm': '1.128', 'learning_rate': '4.975e-05', 'epoch': '0.322', 'num_input_tokens_seen': 26174989, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.232', 'grad_norm': '0.7691', 'learning_rate': '4.975e-05', 'epoch': '0.322', 'num_input_tokens_seen': 26177036, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.458', 'grad_norm': '2.951', 'learning_rate': '4.975e-05', 'epoch': '0.322', 'num_input_tokens_seen': 26179083, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6495', 'grad_norm': '1.143', 'learning_rate': '4.975e-05', 'epoch': '0.322', 'num_input_tokens_seen': 26181130, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5644', 'grad_norm': '0.9997', 'learning_rate': '4.975e-05', 'epoch': '0.3221', 'num_input_tokens_seen': 26183177, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5097', 'grad_norm': '1.156', 'learning_rate': '4.975e-05', 'epoch': '0.3221', 'num_input_tokens_seen': 26185224, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9432', 'grad_norm': '1.602', 'learning_rate': '4.975e-05', 'epoch': '0.3221', 'num_input_tokens_seen': 26187271, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2169', 'grad_norm': '0.9767', 'learning_rate': '4.975e-05', 'epoch': '0.3221', 'num_input_tokens_seen': 26189318, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4147', 'grad_norm': '1.086', 'learning_rate': '4.975e-05', 'epoch': '0.3222', 'num_input_tokens_seen': 26191365, 'train_runtime': '1.325e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '2.002', 'learning_rate': '4.975e-05', 'epoch': '0.3222', 'num_input_tokens_seen': 26193412, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8001', 'grad_norm': '1.483', 'learning_rate': '4.975e-05', 'epoch': '0.3222', 'num_input_tokens_seen': 26195459, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9973', 'grad_norm': '1.966', 'learning_rate': '4.975e-05', 'epoch': '0.3222', 'num_input_tokens_seen': 26197506, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6581', 'grad_norm': '1.608', 'learning_rate': '4.975e-05', 'epoch': '0.3223', 'num_input_tokens_seen': 26199553, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5365', 'grad_norm': '1.095', 'learning_rate': '4.975e-05', 'epoch': '0.3223', 'num_input_tokens_seen': 26201600, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.405', 'grad_norm': '1.022', 'learning_rate': '4.975e-05', 'epoch': '0.3223', 'num_input_tokens_seen': 26203647, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7271', 'grad_norm': '1.915', 'learning_rate': '4.975e-05', 'epoch': '0.3223', 'num_input_tokens_seen': 26205694, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3245', 'grad_norm': '0.7964', 'learning_rate': '4.975e-05', 'epoch': '0.3224', 'num_input_tokens_seen': 26207741, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4227', 'grad_norm': '1.06', 'learning_rate': '4.975e-05', 'epoch': '0.3224', 'num_input_tokens_seen': 26209788, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7601', 'grad_norm': '1.333', 'learning_rate': '4.975e-05', 'epoch': '0.3224', 'num_input_tokens_seen': 26211835, 'train_runtime': '1.326e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.016', 'grad_norm': '1.865', 'learning_rate': '4.975e-05', 'epoch': '0.3224', 'num_input_tokens_seen': 26213882, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6087', 'grad_norm': '1.186', 'learning_rate': '4.975e-05', 'epoch': '0.3225', 'num_input_tokens_seen': 26215929, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4872', 'grad_norm': '1.108', 'learning_rate': '4.975e-05', 'epoch': '0.3225', 'num_input_tokens_seen': 26217976, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7911', 'grad_norm': '1.352', 'learning_rate': '4.975e-05', 'epoch': '0.3225', 'num_input_tokens_seen': 26220023, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.499', 'grad_norm': '2.733', 'learning_rate': '4.975e-05', 'epoch': '0.3225', 'num_input_tokens_seen': 26222070, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3856', 'grad_norm': '0.8587', 'learning_rate': '4.975e-05', 'epoch': '0.3226', 'num_input_tokens_seen': 26224117, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7472', 'grad_norm': '1.423', 'learning_rate': '4.975e-05', 'epoch': '0.3226', 'num_input_tokens_seen': 26226164, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.697', 'grad_norm': '1.307', 'learning_rate': '4.975e-05', 'epoch': '0.3226', 'num_input_tokens_seen': 26228211, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4903', 'grad_norm': '1.142', 'learning_rate': '4.975e-05', 'epoch': '0.3226', 'num_input_tokens_seen': 26230258, 'train_runtime': '1.327e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.091', 'grad_norm': '2.106', 'learning_rate': '4.975e-05', 'epoch': '0.3227', 'num_input_tokens_seen': 26232305, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4872', 'grad_norm': '1.114', 'learning_rate': '4.975e-05', 'epoch': '0.3227', 'num_input_tokens_seen': 26234352, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1957', 'grad_norm': '0.8676', 'learning_rate': '4.975e-05', 'epoch': '0.3227', 'num_input_tokens_seen': 26236399, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.556', 'grad_norm': '1.339', 'learning_rate': '4.975e-05', 'epoch': '0.3227', 'num_input_tokens_seen': 26238446, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3746', 'grad_norm': '0.8815', 'learning_rate': '4.975e-05', 'epoch': '0.3228', 'num_input_tokens_seen': 26240493, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8376', 'grad_norm': '1.82', 'learning_rate': '4.975e-05', 'epoch': '0.3228', 'num_input_tokens_seen': 26242540, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2568', 'grad_norm': '0.7938', 'learning_rate': '4.975e-05', 'epoch': '0.3228', 'num_input_tokens_seen': 26244587, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4255', 'grad_norm': '1.089', 'learning_rate': '4.975e-05', 'epoch': '0.3228', 'num_input_tokens_seen': 26246634, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.528', 'grad_norm': '3.126', 'learning_rate': '4.975e-05', 'epoch': '0.3229', 'num_input_tokens_seen': 26248681, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3571', 'grad_norm': '0.9442', 'learning_rate': '4.975e-05', 'epoch': '0.3229', 'num_input_tokens_seen': 26250728, 'train_runtime': '1.328e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.555', 'grad_norm': '2.628', 'learning_rate': '4.975e-05', 'epoch': '0.3229', 'num_input_tokens_seen': 26252775, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7775', 'grad_norm': '1.369', 'learning_rate': '4.975e-05', 'epoch': '0.3229', 'num_input_tokens_seen': 26254822, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2944', 'grad_norm': '0.8437', 'learning_rate': '4.975e-05', 'epoch': '0.323', 'num_input_tokens_seen': 26256869, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5248', 'grad_norm': '1.473', 'learning_rate': '4.975e-05', 'epoch': '0.323', 'num_input_tokens_seen': 26258916, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3717', 'grad_norm': '1.069', 'learning_rate': '4.974e-05', 'epoch': '0.323', 'num_input_tokens_seen': 26260963, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6014', 'grad_norm': '1.498', 'learning_rate': '4.974e-05', 'epoch': '0.323', 'num_input_tokens_seen': 26263010, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2578', 'grad_norm': '0.7939', 'learning_rate': '4.974e-05', 'epoch': '0.3231', 'num_input_tokens_seen': 26265057, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.222', 'grad_norm': '2.103', 'learning_rate': '4.974e-05', 'epoch': '0.3231', 'num_input_tokens_seen': 26267104, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5561', 'grad_norm': '1.176', 'learning_rate': '4.974e-05', 'epoch': '0.3231', 'num_input_tokens_seen': 26269151, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7651', 'grad_norm': '1.504', 'learning_rate': '4.974e-05', 'epoch': '0.3231', 'num_input_tokens_seen': 26271198, 'train_runtime': '1.329e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3138', 'grad_norm': '0.9195', 'learning_rate': '4.974e-05', 'epoch': '0.3232', 'num_input_tokens_seen': 26273245, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2412', 'grad_norm': '0.8052', 'learning_rate': '4.974e-05', 'epoch': '0.3232', 'num_input_tokens_seen': 26275292, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7221', 'grad_norm': '1.284', 'learning_rate': '4.974e-05', 'epoch': '0.3232', 'num_input_tokens_seen': 26277339, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3279', 'grad_norm': '0.8346', 'learning_rate': '4.974e-05', 'epoch': '0.3232', 'num_input_tokens_seen': 26279386, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4177', 'grad_norm': '1.08', 'learning_rate': '4.974e-05', 'epoch': '0.3233', 'num_input_tokens_seen': 26281433, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2389', 'grad_norm': '0.9052', 'learning_rate': '4.974e-05', 'epoch': '0.3233', 'num_input_tokens_seen': 26283480, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.198', 'grad_norm': '1.869', 'learning_rate': '4.974e-05', 'epoch': '0.3233', 'num_input_tokens_seen': 26285527, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7126', 'grad_norm': '1.279', 'learning_rate': '4.974e-05', 'epoch': '0.3233', 'num_input_tokens_seen': 26287574, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.334', 'grad_norm': '1.837', 'learning_rate': '4.974e-05', 'epoch': '0.3234', 'num_input_tokens_seen': 26289621, 'train_runtime': '1.33e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '1.358', 'learning_rate': '4.974e-05', 'epoch': '0.3234', 'num_input_tokens_seen': 26291668, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.291', 'grad_norm': '2.204', 'learning_rate': '4.974e-05', 'epoch': '0.3234', 'num_input_tokens_seen': 26293715, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4005', 'grad_norm': '1.006', 'learning_rate': '4.974e-05', 'epoch': '0.3234', 'num_input_tokens_seen': 26295762, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4699', 'grad_norm': '0.9884', 'learning_rate': '4.974e-05', 'epoch': '0.3235', 'num_input_tokens_seen': 26297809, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4598', 'grad_norm': '1.047', 'learning_rate': '4.974e-05', 'epoch': '0.3235', 'num_input_tokens_seen': 26299856, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8445', 'grad_norm': '2.299', 'learning_rate': '4.974e-05', 'epoch': '0.3235', 'num_input_tokens_seen': 26301903, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2691', 'grad_norm': '0.7497', 'learning_rate': '4.974e-05', 'epoch': '0.3235', 'num_input_tokens_seen': 26303950, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7314', 'grad_norm': '1.502', 'learning_rate': '4.974e-05', 'epoch': '0.3236', 'num_input_tokens_seen': 26305997, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3882', 'grad_norm': '0.9626', 'learning_rate': '4.974e-05', 'epoch': '0.3236', 'num_input_tokens_seen': 26308044, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6044', 'grad_norm': '1.27', 'learning_rate': '4.974e-05', 'epoch': '0.3236', 'num_input_tokens_seen': 26310091, 'train_runtime': '1.331e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3047', 'grad_norm': '0.8922', 'learning_rate': '4.974e-05', 'epoch': '0.3236', 'num_input_tokens_seen': 26312138, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2927', 'grad_norm': '0.7214', 'learning_rate': '4.974e-05', 'epoch': '0.3237', 'num_input_tokens_seen': 26314185, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.046', 'grad_norm': '1.275', 'learning_rate': '4.974e-05', 'epoch': '0.3237', 'num_input_tokens_seen': 26316232, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4298', 'grad_norm': '0.9286', 'learning_rate': '4.974e-05', 'epoch': '0.3237', 'num_input_tokens_seen': 26318279, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2792', 'grad_norm': '0.8407', 'learning_rate': '4.974e-05', 'epoch': '0.3237', 'num_input_tokens_seen': 26320326, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.408', 'grad_norm': '0.8384', 'learning_rate': '4.974e-05', 'epoch': '0.3238', 'num_input_tokens_seen': 26322373, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.214', 'grad_norm': '1.642', 'learning_rate': '4.974e-05', 'epoch': '0.3238', 'num_input_tokens_seen': 26324420, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8617', 'grad_norm': '1.439', 'learning_rate': '4.974e-05', 'epoch': '0.3238', 'num_input_tokens_seen': 26326467, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2629', 'grad_norm': '0.8676', 'learning_rate': '4.974e-05', 'epoch': '0.3238', 'num_input_tokens_seen': 26328514, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4315', 'grad_norm': '1.155', 'learning_rate': '4.974e-05', 'epoch': '0.3239', 'num_input_tokens_seen': 26330561, 'train_runtime': '1.332e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4046', 'grad_norm': '0.8508', 'learning_rate': '4.974e-05', 'epoch': '0.3239', 'num_input_tokens_seen': 26332608, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8954', 'grad_norm': '1.281', 'learning_rate': '4.974e-05', 'epoch': '0.3239', 'num_input_tokens_seen': 26334655, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3031', 'grad_norm': '0.882', 'learning_rate': '4.974e-05', 'epoch': '0.3239', 'num_input_tokens_seen': 26336702, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8581', 'grad_norm': '1.523', 'learning_rate': '4.974e-05', 'epoch': '0.324', 'num_input_tokens_seen': 26338749, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6689', 'grad_norm': '1.586', 'learning_rate': '4.974e-05', 'epoch': '0.324', 'num_input_tokens_seen': 26340796, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3943', 'grad_norm': '0.8228', 'learning_rate': '4.974e-05', 'epoch': '0.324', 'num_input_tokens_seen': 26342843, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.715', 'grad_norm': '1.527', 'learning_rate': '4.974e-05', 'epoch': '0.324', 'num_input_tokens_seen': 26344890, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.112', 'grad_norm': '2.103', 'learning_rate': '4.974e-05', 'epoch': '0.3241', 'num_input_tokens_seen': 26346937, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6336', 'grad_norm': '1.114', 'learning_rate': '4.974e-05', 'epoch': '0.3241', 'num_input_tokens_seen': 26348984, 'train_runtime': '1.333e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2126', 'grad_norm': '0.7499', 'learning_rate': '4.974e-05', 'epoch': '0.3241', 'num_input_tokens_seen': 26351031, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.691', 'grad_norm': '2.905', 'learning_rate': '4.974e-05', 'epoch': '0.3241', 'num_input_tokens_seen': 26353078, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3104', 'grad_norm': '1.283', 'learning_rate': '4.974e-05', 'epoch': '0.3242', 'num_input_tokens_seen': 26355125, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.406', 'grad_norm': '2.572', 'learning_rate': '4.974e-05', 'epoch': '0.3242', 'num_input_tokens_seen': 26357172, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7021', 'grad_norm': '1.4', 'learning_rate': '4.974e-05', 'epoch': '0.3242', 'num_input_tokens_seen': 26359219, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9456', 'grad_norm': '1.725', 'learning_rate': '4.974e-05', 'epoch': '0.3242', 'num_input_tokens_seen': 26361266, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7293', 'grad_norm': '1.347', 'learning_rate': '4.974e-05', 'epoch': '0.3243', 'num_input_tokens_seen': 26363313, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7187', 'grad_norm': '1.098', 'learning_rate': '4.974e-05', 'epoch': '0.3243', 'num_input_tokens_seen': 26365360, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4908', 'grad_norm': '1.098', 'learning_rate': '4.974e-05', 'epoch': '0.3243', 'num_input_tokens_seen': 26367407, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9231', 'grad_norm': '1.754', 'learning_rate': '4.974e-05', 'epoch': '0.3243', 'num_input_tokens_seen': 26369454, 'train_runtime': '1.334e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4771', 'grad_norm': '1.05', 'learning_rate': '4.974e-05', 'epoch': '0.3244', 'num_input_tokens_seen': 26371501, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.009', 'grad_norm': '2.215', 'learning_rate': '4.974e-05', 'epoch': '0.3244', 'num_input_tokens_seen': 26373548, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3275', 'grad_norm': '1.075', 'learning_rate': '4.974e-05', 'epoch': '0.3244', 'num_input_tokens_seen': 26375595, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3528', 'grad_norm': '0.8852', 'learning_rate': '4.974e-05', 'epoch': '0.3244', 'num_input_tokens_seen': 26377642, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.064', 'grad_norm': '2.38', 'learning_rate': '4.974e-05', 'epoch': '0.3245', 'num_input_tokens_seen': 26379689, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4768', 'grad_norm': '1.117', 'learning_rate': '4.974e-05', 'epoch': '0.3245', 'num_input_tokens_seen': 26381736, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3875', 'grad_norm': '0.8515', 'learning_rate': '4.974e-05', 'epoch': '0.3245', 'num_input_tokens_seen': 26383783, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4335', 'grad_norm': '0.9279', 'learning_rate': '4.974e-05', 'epoch': '0.3245', 'num_input_tokens_seen': 26385830, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7702', 'grad_norm': '1.272', 'learning_rate': '4.974e-05', 'epoch': '0.3246', 'num_input_tokens_seen': 26387877, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.171', 'grad_norm': '2.131', 'learning_rate': '4.974e-05', 'epoch': '0.3246', 'num_input_tokens_seen': 26389924, 'train_runtime': '1.335e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7933', 'grad_norm': '1.603', 'learning_rate': '4.974e-05', 'epoch': '0.3246', 'num_input_tokens_seen': 26391971, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9112', 'grad_norm': '1.583', 'learning_rate': '4.974e-05', 'epoch': '0.3246', 'num_input_tokens_seen': 26394018, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5455', 'grad_norm': '0.9007', 'learning_rate': '4.974e-05', 'epoch': '0.3247', 'num_input_tokens_seen': 26396065, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.619', 'grad_norm': '1.179', 'learning_rate': '4.974e-05', 'epoch': '0.3247', 'num_input_tokens_seen': 26398112, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3028', 'grad_norm': '0.8749', 'learning_rate': '4.974e-05', 'epoch': '0.3247', 'num_input_tokens_seen': 26400159, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2737', 'grad_norm': '0.9525', 'learning_rate': '4.974e-05', 'epoch': '0.3247', 'num_input_tokens_seen': 26402206, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.168', 'grad_norm': '1.931', 'learning_rate': '4.974e-05', 'epoch': '0.3248', 'num_input_tokens_seen': 26404253, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5069', 'grad_norm': '1.274', 'learning_rate': '4.974e-05', 'epoch': '0.3248', 'num_input_tokens_seen': 26406300, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8505', 'grad_norm': '1.592', 'learning_rate': '4.974e-05', 'epoch': '0.3248', 'num_input_tokens_seen': 26408347, 'train_runtime': '1.336e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.28', 'grad_norm': '0.8488', 'learning_rate': '4.974e-05', 'epoch': '0.3248', 'num_input_tokens_seen': 26410394, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3571', 'grad_norm': '1.06', 'learning_rate': '4.974e-05', 'epoch': '0.3249', 'num_input_tokens_seen': 26412441, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1407', 'grad_norm': '0.8196', 'learning_rate': '4.974e-05', 'epoch': '0.3249', 'num_input_tokens_seen': 26414488, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6538', 'grad_norm': '1.306', 'learning_rate': '4.974e-05', 'epoch': '0.3249', 'num_input_tokens_seen': 26416535, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7262', 'grad_norm': '1.942', 'learning_rate': '4.974e-05', 'epoch': '0.3249', 'num_input_tokens_seen': 26418582, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.05', 'grad_norm': '1.78', 'learning_rate': '4.974e-05', 'epoch': '0.325', 'num_input_tokens_seen': 26420629, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7427', 'grad_norm': '1.236', 'learning_rate': '4.974e-05', 'epoch': '0.325', 'num_input_tokens_seen': 26422676, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4709', 'grad_norm': '1.121', 'learning_rate': '4.974e-05', 'epoch': '0.325', 'num_input_tokens_seen': 26424723, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5642', 'grad_norm': '1.41', 'learning_rate': '4.974e-05', 'epoch': '0.325', 'num_input_tokens_seen': 26426770, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5315', 'grad_norm': '1.344', 'learning_rate': '4.974e-05', 'epoch': '0.3251', 'num_input_tokens_seen': 26428817, 'train_runtime': '1.337e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.532', 'grad_norm': '2.543', 'learning_rate': '4.974e-05', 'epoch': '0.3251', 'num_input_tokens_seen': 26430864, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7855', 'grad_norm': '1.463', 'learning_rate': '4.974e-05', 'epoch': '0.3251', 'num_input_tokens_seen': 26432911, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.643', 'grad_norm': '2.58', 'learning_rate': '4.974e-05', 'epoch': '0.3252', 'num_input_tokens_seen': 26434958, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9867', 'grad_norm': '1.234', 'learning_rate': '4.974e-05', 'epoch': '0.3252', 'num_input_tokens_seen': 26437005, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2083', 'grad_norm': '0.8846', 'learning_rate': '4.974e-05', 'epoch': '0.3252', 'num_input_tokens_seen': 26439052, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.032', 'grad_norm': '3.626', 'learning_rate': '4.974e-05', 'epoch': '0.3252', 'num_input_tokens_seen': 26441099, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3991', 'grad_norm': '1.061', 'learning_rate': '4.974e-05', 'epoch': '0.3253', 'num_input_tokens_seen': 26443146, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2503', 'grad_norm': '0.7747', 'learning_rate': '4.974e-05', 'epoch': '0.3253', 'num_input_tokens_seen': 26445193, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.723', 'grad_norm': '1.399', 'learning_rate': '4.974e-05', 'epoch': '0.3253', 'num_input_tokens_seen': 26447240, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4324', 'grad_norm': '1.375', 'learning_rate': '4.974e-05', 'epoch': '0.3253', 'num_input_tokens_seen': 26449287, 'train_runtime': '1.338e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6124', 'grad_norm': '0.9704', 'learning_rate': '4.974e-05', 'epoch': '0.3254', 'num_input_tokens_seen': 26451334, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5022', 'grad_norm': '1.393', 'learning_rate': '4.974e-05', 'epoch': '0.3254', 'num_input_tokens_seen': 26453381, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3224', 'grad_norm': '0.8456', 'learning_rate': '4.974e-05', 'epoch': '0.3254', 'num_input_tokens_seen': 26455428, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.451', 'grad_norm': '2.153', 'learning_rate': '4.974e-05', 'epoch': '0.3254', 'num_input_tokens_seen': 26457475, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4056', 'grad_norm': '0.9247', 'learning_rate': '4.974e-05', 'epoch': '0.3255', 'num_input_tokens_seen': 26459522, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2914', 'grad_norm': '0.7996', 'learning_rate': '4.974e-05', 'epoch': '0.3255', 'num_input_tokens_seen': 26461569, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.463', 'grad_norm': '2.359', 'learning_rate': '4.974e-05', 'epoch': '0.3255', 'num_input_tokens_seen': 26463616, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2332', 'grad_norm': '0.8271', 'learning_rate': '4.974e-05', 'epoch': '0.3255', 'num_input_tokens_seen': 26465663, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6254', 'grad_norm': '0.9892', 'learning_rate': '4.974e-05', 'epoch': '0.3256', 'num_input_tokens_seen': 26467710, 'train_runtime': '1.339e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.135', 'grad_norm': '1.932', 'learning_rate': '4.974e-05', 'epoch': '0.3256', 'num_input_tokens_seen': 26469757, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3514', 'grad_norm': '0.9451', 'learning_rate': '4.974e-05', 'epoch': '0.3256', 'num_input_tokens_seen': 26471804, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6967', 'grad_norm': '1.33', 'learning_rate': '4.974e-05', 'epoch': '0.3256', 'num_input_tokens_seen': 26473851, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.245', 'grad_norm': '0.8592', 'learning_rate': '4.974e-05', 'epoch': '0.3257', 'num_input_tokens_seen': 26475898, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9646', 'grad_norm': '1.896', 'learning_rate': '4.974e-05', 'epoch': '0.3257', 'num_input_tokens_seen': 26477945, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3028', 'grad_norm': '0.7283', 'learning_rate': '4.974e-05', 'epoch': '0.3257', 'num_input_tokens_seen': 26479992, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7654', 'grad_norm': '1.23', 'learning_rate': '4.974e-05', 'epoch': '0.3257', 'num_input_tokens_seen': 26482039, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4997', 'grad_norm': '1.37', 'learning_rate': '4.974e-05', 'epoch': '0.3258', 'num_input_tokens_seen': 26484086, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.751', 'grad_norm': '2.661', 'learning_rate': '4.974e-05', 'epoch': '0.3258', 'num_input_tokens_seen': 26486133, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6801', 'grad_norm': '0.9514', 'learning_rate': '4.974e-05', 'epoch': '0.3258', 'num_input_tokens_seen': 26488180, 'train_runtime': '1.34e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.541', 'grad_norm': '1.11', 'learning_rate': '4.974e-05', 'epoch': '0.3258', 'num_input_tokens_seen': 26490227, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.333', 'grad_norm': '0.8131', 'learning_rate': '4.974e-05', 'epoch': '0.3259', 'num_input_tokens_seen': 26492274, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7136', 'grad_norm': '1.219', 'learning_rate': '4.974e-05', 'epoch': '0.3259', 'num_input_tokens_seen': 26494321, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.104', 'grad_norm': '2.016', 'learning_rate': '4.974e-05', 'epoch': '0.3259', 'num_input_tokens_seen': 26496368, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.028', 'grad_norm': '1.176', 'learning_rate': '4.974e-05', 'epoch': '0.3259', 'num_input_tokens_seen': 26498415, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7465', 'grad_norm': '1.346', 'learning_rate': '4.974e-05', 'epoch': '0.326', 'num_input_tokens_seen': 26500462, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2992', 'grad_norm': '0.9344', 'learning_rate': '4.974e-05', 'epoch': '0.326', 'num_input_tokens_seen': 26502509, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6141', 'grad_norm': '1.253', 'learning_rate': '4.974e-05', 'epoch': '0.326', 'num_input_tokens_seen': 26504556, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4315', 'grad_norm': '1.434', 'learning_rate': '4.974e-05', 'epoch': '0.326', 'num_input_tokens_seen': 26506603, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.847', 'grad_norm': '1.46', 'learning_rate': '4.974e-05', 'epoch': '0.3261', 'num_input_tokens_seen': 26508650, 'train_runtime': '1.341e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5759', 'grad_norm': '1.281', 'learning_rate': '4.974e-05', 'epoch': '0.3261', 'num_input_tokens_seen': 26510697, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5239', 'grad_norm': '1.237', 'learning_rate': '4.974e-05', 'epoch': '0.3261', 'num_input_tokens_seen': 26512744, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3388', 'grad_norm': '1.08', 'learning_rate': '4.974e-05', 'epoch': '0.3261', 'num_input_tokens_seen': 26514791, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2763', 'grad_norm': '0.821', 'learning_rate': '4.974e-05', 'epoch': '0.3262', 'num_input_tokens_seen': 26516838, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2631', 'grad_norm': '0.9928', 'learning_rate': '4.974e-05', 'epoch': '0.3262', 'num_input_tokens_seen': 26518885, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6865', 'grad_norm': '1.322', 'learning_rate': '4.974e-05', 'epoch': '0.3262', 'num_input_tokens_seen': 26520932, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.029', 'grad_norm': '1.859', 'learning_rate': '4.974e-05', 'epoch': '0.3262', 'num_input_tokens_seen': 26522979, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6824', 'grad_norm': '1.693', 'learning_rate': '4.974e-05', 'epoch': '0.3263', 'num_input_tokens_seen': 26525026, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3678', 'grad_norm': '0.8535', 'learning_rate': '4.974e-05', 'epoch': '0.3263', 'num_input_tokens_seen': 26527073, 'train_runtime': '1.342e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.303', 'grad_norm': '2.855', 'learning_rate': '4.974e-05', 'epoch': '0.3263', 'num_input_tokens_seen': 26529120, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6138', 'grad_norm': '1.05', 'learning_rate': '4.974e-05', 'epoch': '0.3263', 'num_input_tokens_seen': 26531167, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5652', 'grad_norm': '1.391', 'learning_rate': '4.974e-05', 'epoch': '0.3264', 'num_input_tokens_seen': 26533214, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6012', 'grad_norm': '1.263', 'learning_rate': '4.974e-05', 'epoch': '0.3264', 'num_input_tokens_seen': 26535261, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.361', 'grad_norm': '0.9182', 'learning_rate': '4.974e-05', 'epoch': '0.3264', 'num_input_tokens_seen': 26537308, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6338', 'grad_norm': '1.343', 'learning_rate': '4.974e-05', 'epoch': '0.3264', 'num_input_tokens_seen': 26539355, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.147', 'grad_norm': '1.761', 'learning_rate': '4.974e-05', 'epoch': '0.3265', 'num_input_tokens_seen': 26541402, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7614', 'grad_norm': '1.352', 'learning_rate': '4.974e-05', 'epoch': '0.3265', 'num_input_tokens_seen': 26543449, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7187', 'grad_norm': '1.395', 'learning_rate': '4.974e-05', 'epoch': '0.3265', 'num_input_tokens_seen': 26545496, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.15', 'grad_norm': '1.846', 'learning_rate': '4.974e-05', 'epoch': '0.3265', 'num_input_tokens_seen': 26547543, 'train_runtime': '1.343e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6582', 'grad_norm': '1.235', 'learning_rate': '4.974e-05', 'epoch': '0.3266', 'num_input_tokens_seen': 26549590, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4703', 'grad_norm': '1.04', 'learning_rate': '4.974e-05', 'epoch': '0.3266', 'num_input_tokens_seen': 26551637, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6649', 'grad_norm': '1.28', 'learning_rate': '4.974e-05', 'epoch': '0.3266', 'num_input_tokens_seen': 26553684, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3414', 'grad_norm': '0.8774', 'learning_rate': '4.974e-05', 'epoch': '0.3266', 'num_input_tokens_seen': 26555731, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7561', 'grad_norm': '1.289', 'learning_rate': '4.974e-05', 'epoch': '0.3267', 'num_input_tokens_seen': 26557778, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.339', 'grad_norm': '0.952', 'learning_rate': '4.974e-05', 'epoch': '0.3267', 'num_input_tokens_seen': 26559825, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5477', 'grad_norm': '1.453', 'learning_rate': '4.974e-05', 'epoch': '0.3267', 'num_input_tokens_seen': 26561872, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8531', 'grad_norm': '1.162', 'learning_rate': '4.974e-05', 'epoch': '0.3267', 'num_input_tokens_seen': 26563919, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.355', 'grad_norm': '0.9383', 'learning_rate': '4.974e-05', 'epoch': '0.3268', 'num_input_tokens_seen': 26565966, 'train_runtime': '1.344e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8975', 'grad_norm': '1.524', 'learning_rate': '4.974e-05', 'epoch': '0.3268', 'num_input_tokens_seen': 26568013, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4354', 'grad_norm': '1.164', 'learning_rate': '4.974e-05', 'epoch': '0.3268', 'num_input_tokens_seen': 26570060, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8611', 'grad_norm': '1.39', 'learning_rate': '4.974e-05', 'epoch': '0.3268', 'num_input_tokens_seen': 26572107, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.815', 'grad_norm': '2.624', 'learning_rate': '4.974e-05', 'epoch': '0.3269', 'num_input_tokens_seen': 26574154, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8219', 'grad_norm': '1.049', 'learning_rate': '4.974e-05', 'epoch': '0.3269', 'num_input_tokens_seen': 26576201, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7282', 'grad_norm': '1.172', 'learning_rate': '4.974e-05', 'epoch': '0.3269', 'num_input_tokens_seen': 26578248, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5473', 'grad_norm': '1.296', 'learning_rate': '4.974e-05', 'epoch': '0.3269', 'num_input_tokens_seen': 26580295, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3138', 'grad_norm': '0.8858', 'learning_rate': '4.974e-05', 'epoch': '0.327', 'num_input_tokens_seen': 26582342, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3117', 'grad_norm': '0.8156', 'learning_rate': '4.974e-05', 'epoch': '0.327', 'num_input_tokens_seen': 26584389, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.289', 'grad_norm': '1.984', 'learning_rate': '4.974e-05', 'epoch': '0.327', 'num_input_tokens_seen': 26586436, 'train_runtime': '1.345e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6378', 'grad_norm': '1.54', 'learning_rate': '4.974e-05', 'epoch': '0.327', 'num_input_tokens_seen': 26588483, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7858', 'grad_norm': '1.383', 'learning_rate': '4.974e-05', 'epoch': '0.3271', 'num_input_tokens_seen': 26590530, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2885', 'grad_norm': '0.8615', 'learning_rate': '4.974e-05', 'epoch': '0.3271', 'num_input_tokens_seen': 26592577, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8695', 'grad_norm': '1.79', 'learning_rate': '4.974e-05', 'epoch': '0.3271', 'num_input_tokens_seen': 26594624, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.003', 'grad_norm': '1.784', 'learning_rate': '4.974e-05', 'epoch': '0.3271', 'num_input_tokens_seen': 26596671, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.671', 'grad_norm': '2.89', 'learning_rate': '4.974e-05', 'epoch': '0.3272', 'num_input_tokens_seen': 26598718, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.604', 'grad_norm': '1.268', 'learning_rate': '4.974e-05', 'epoch': '0.3272', 'num_input_tokens_seen': 26600765, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9888', 'grad_norm': '1.385', 'learning_rate': '4.974e-05', 'epoch': '0.3272', 'num_input_tokens_seen': 26602812, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8984', 'grad_norm': '1.463', 'learning_rate': '4.974e-05', 'epoch': '0.3272', 'num_input_tokens_seen': 26604859, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.603', 'grad_norm': '0.9585', 'learning_rate': '4.974e-05', 'epoch': '0.3273', 'num_input_tokens_seen': 26606906, 'train_runtime': '1.346e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5338', 'grad_norm': '1.321', 'learning_rate': '4.974e-05', 'epoch': '0.3273', 'num_input_tokens_seen': 26608953, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6666', 'grad_norm': '1.062', 'learning_rate': '4.974e-05', 'epoch': '0.3273', 'num_input_tokens_seen': 26611000, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +[INFO|configuration_utils.py:665] 2026-02-05 06:21:52,288 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 06:21:52,288 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 06:21:52,738 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-13000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 06:21:52,744 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-13000/tokenizer_config.json + +{'loss': '0.5337', 'grad_norm': '1.339', 'learning_rate': '4.973e-05', 'epoch': '0.3273', 'num_input_tokens_seen': 26613047, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2938', 'grad_norm': '0.7477', 'learning_rate': '4.973e-05', 'epoch': '0.3274', 'num_input_tokens_seen': 26615094, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4881', 'grad_norm': '1.368', 'learning_rate': '4.973e-05', 'epoch': '0.3274', 'num_input_tokens_seen': 26617141, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4926', 'grad_norm': '1.315', 'learning_rate': '4.973e-05', 'epoch': '0.3274', 'num_input_tokens_seen': 26619188, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4325', 'grad_norm': '1.119', 'learning_rate': '4.973e-05', 'epoch': '0.3274', 'num_input_tokens_seen': 26621235, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2455', 'grad_norm': '0.8722', 'learning_rate': '4.973e-05', 'epoch': '0.3275', 'num_input_tokens_seen': 26623282, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3552', 'grad_norm': '0.7404', 'learning_rate': '4.973e-05', 'epoch': '0.3275', 'num_input_tokens_seen': 26625329, 'train_runtime': '1.347e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9188', 'grad_norm': '1.446', 'learning_rate': '4.973e-05', 'epoch': '0.3275', 'num_input_tokens_seen': 26627376, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6837', 'grad_norm': '1.245', 'learning_rate': '4.973e-05', 'epoch': '0.3275', 'num_input_tokens_seen': 26629423, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3682', 'grad_norm': '0.9142', 'learning_rate': '4.973e-05', 'epoch': '0.3276', 'num_input_tokens_seen': 26631470, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.056', 'grad_norm': '1.867', 'learning_rate': '4.973e-05', 'epoch': '0.3276', 'num_input_tokens_seen': 26633517, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3528', 'grad_norm': '0.9002', 'learning_rate': '4.973e-05', 'epoch': '0.3276', 'num_input_tokens_seen': 26635564, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.16', 'grad_norm': '2.05', 'learning_rate': '4.973e-05', 'epoch': '0.3276', 'num_input_tokens_seen': 26637611, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8883', 'grad_norm': '2.118', 'learning_rate': '4.973e-05', 'epoch': '0.3277', 'num_input_tokens_seen': 26639658, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1946', 'grad_norm': '0.681', 'learning_rate': '4.973e-05', 'epoch': '0.3277', 'num_input_tokens_seen': 26641705, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7574', 'grad_norm': '1.346', 'learning_rate': '4.973e-05', 'epoch': '0.3277', 'num_input_tokens_seen': 26643752, 'train_runtime': '1.348e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4113', 'grad_norm': '1.058', 'learning_rate': '4.973e-05', 'epoch': '0.3277', 'num_input_tokens_seen': 26645799, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4072', 'grad_norm': '1.194', 'learning_rate': '4.973e-05', 'epoch': '0.3278', 'num_input_tokens_seen': 26647846, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2515', 'grad_norm': '0.7268', 'learning_rate': '4.973e-05', 'epoch': '0.3278', 'num_input_tokens_seen': 26649893, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8003', 'grad_norm': '1.436', 'learning_rate': '4.973e-05', 'epoch': '0.3278', 'num_input_tokens_seen': 26651940, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4068', 'grad_norm': '0.9592', 'learning_rate': '4.973e-05', 'epoch': '0.3278', 'num_input_tokens_seen': 26653987, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6355', 'grad_norm': '1.85', 'learning_rate': '4.973e-05', 'epoch': '0.3279', 'num_input_tokens_seen': 26656034, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7764', 'grad_norm': '1.217', 'learning_rate': '4.973e-05', 'epoch': '0.3279', 'num_input_tokens_seen': 26658081, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4197', 'grad_norm': '1.03', 'learning_rate': '4.973e-05', 'epoch': '0.3279', 'num_input_tokens_seen': 26660128, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4925', 'grad_norm': '1.083', 'learning_rate': '4.973e-05', 'epoch': '0.3279', 'num_input_tokens_seen': 26662175, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.386', 'grad_norm': '2.465', 'learning_rate': '4.973e-05', 'epoch': '0.328', 'num_input_tokens_seen': 26664222, 'train_runtime': '1.349e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8319', 'grad_norm': '1.565', 'learning_rate': '4.973e-05', 'epoch': '0.328', 'num_input_tokens_seen': 26666269, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4714', 'grad_norm': '0.7952', 'learning_rate': '4.973e-05', 'epoch': '0.328', 'num_input_tokens_seen': 26668316, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3956', 'grad_norm': '1.037', 'learning_rate': '4.973e-05', 'epoch': '0.328', 'num_input_tokens_seen': 26670363, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.272', 'grad_norm': '0.9012', 'learning_rate': '4.973e-05', 'epoch': '0.3281', 'num_input_tokens_seen': 26672410, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8525', 'grad_norm': '1.482', 'learning_rate': '4.973e-05', 'epoch': '0.3281', 'num_input_tokens_seen': 26674457, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9791', 'grad_norm': '1.727', 'learning_rate': '4.973e-05', 'epoch': '0.3281', 'num_input_tokens_seen': 26676504, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2746', 'grad_norm': '0.8579', 'learning_rate': '4.973e-05', 'epoch': '0.3281', 'num_input_tokens_seen': 26678551, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2704', 'grad_norm': '0.7561', 'learning_rate': '4.973e-05', 'epoch': '0.3282', 'num_input_tokens_seen': 26680598, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3136', 'grad_norm': '0.8868', 'learning_rate': '4.973e-05', 'epoch': '0.3282', 'num_input_tokens_seen': 26682645, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2387', 'grad_norm': '0.7803', 'learning_rate': '4.973e-05', 'epoch': '0.3282', 'num_input_tokens_seen': 26684692, 'train_runtime': '1.35e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7336', 'grad_norm': '1.487', 'learning_rate': '4.973e-05', 'epoch': '0.3282', 'num_input_tokens_seen': 26686739, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5708', 'grad_norm': '1.21', 'learning_rate': '4.973e-05', 'epoch': '0.3283', 'num_input_tokens_seen': 26688786, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6089', 'grad_norm': '1.249', 'learning_rate': '4.973e-05', 'epoch': '0.3283', 'num_input_tokens_seen': 26690833, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5621', 'grad_norm': '1.55', 'learning_rate': '4.973e-05', 'epoch': '0.3283', 'num_input_tokens_seen': 26692880, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2526', 'grad_norm': '0.8136', 'learning_rate': '4.973e-05', 'epoch': '0.3283', 'num_input_tokens_seen': 26694927, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6845', 'grad_norm': '1.416', 'learning_rate': '4.973e-05', 'epoch': '0.3284', 'num_input_tokens_seen': 26696974, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6708', 'grad_norm': '1.3', 'learning_rate': '4.973e-05', 'epoch': '0.3284', 'num_input_tokens_seen': 26699021, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2236', 'grad_norm': '0.8662', 'learning_rate': '4.973e-05', 'epoch': '0.3284', 'num_input_tokens_seen': 26701068, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4344', 'grad_norm': '1.062', 'learning_rate': '4.973e-05', 'epoch': '0.3284', 'num_input_tokens_seen': 26703115, 'train_runtime': '1.351e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6429', 'grad_norm': '1.325', 'learning_rate': '4.973e-05', 'epoch': '0.3285', 'num_input_tokens_seen': 26705162, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.282', 'grad_norm': '2.067', 'learning_rate': '4.973e-05', 'epoch': '0.3285', 'num_input_tokens_seen': 26707209, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.304', 'grad_norm': '1.031', 'learning_rate': '4.973e-05', 'epoch': '0.3285', 'num_input_tokens_seen': 26709256, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8395', 'grad_norm': '1.638', 'learning_rate': '4.973e-05', 'epoch': '0.3285', 'num_input_tokens_seen': 26711303, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.646', 'grad_norm': '1.137', 'learning_rate': '4.973e-05', 'epoch': '0.3286', 'num_input_tokens_seen': 26713350, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7616', 'grad_norm': '1.324', 'learning_rate': '4.973e-05', 'epoch': '0.3286', 'num_input_tokens_seen': 26715397, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2359', 'grad_norm': '0.8398', 'learning_rate': '4.973e-05', 'epoch': '0.3286', 'num_input_tokens_seen': 26717444, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5357', 'grad_norm': '0.9335', 'learning_rate': '4.973e-05', 'epoch': '0.3287', 'num_input_tokens_seen': 26719491, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9047', 'grad_norm': '1.775', 'learning_rate': '4.973e-05', 'epoch': '0.3287', 'num_input_tokens_seen': 26721538, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2938', 'grad_norm': '1.037', 'learning_rate': '4.973e-05', 'epoch': '0.3287', 'num_input_tokens_seen': 26723585, 'train_runtime': '1.352e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8148', 'grad_norm': '1.423', 'learning_rate': '4.973e-05', 'epoch': '0.3287', 'num_input_tokens_seen': 26725632, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7057', 'grad_norm': '1.268', 'learning_rate': '4.973e-05', 'epoch': '0.3288', 'num_input_tokens_seen': 26727679, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.32', 'grad_norm': '2.163', 'learning_rate': '4.973e-05', 'epoch': '0.3288', 'num_input_tokens_seen': 26729726, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3888', 'grad_norm': '0.8888', 'learning_rate': '4.973e-05', 'epoch': '0.3288', 'num_input_tokens_seen': 26731773, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2839', 'grad_norm': '0.8313', 'learning_rate': '4.973e-05', 'epoch': '0.3288', 'num_input_tokens_seen': 26733820, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.883', 'grad_norm': '2.777', 'learning_rate': '4.973e-05', 'epoch': '0.3289', 'num_input_tokens_seen': 26735867, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9188', 'grad_norm': '1.461', 'learning_rate': '4.973e-05', 'epoch': '0.3289', 'num_input_tokens_seen': 26737914, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4595', 'grad_norm': '1.243', 'learning_rate': '4.973e-05', 'epoch': '0.3289', 'num_input_tokens_seen': 26739961, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3318', 'grad_norm': '0.9906', 'learning_rate': '4.973e-05', 'epoch': '0.3289', 'num_input_tokens_seen': 26742008, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7286', 'grad_norm': '1.273', 'learning_rate': '4.973e-05', 'epoch': '0.329', 'num_input_tokens_seen': 26744055, 'train_runtime': '1.353e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9014', 'grad_norm': '1.636', 'learning_rate': '4.973e-05', 'epoch': '0.329', 'num_input_tokens_seen': 26746102, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9952', 'grad_norm': '1.711', 'learning_rate': '4.973e-05', 'epoch': '0.329', 'num_input_tokens_seen': 26748149, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1983', 'grad_norm': '0.7505', 'learning_rate': '4.973e-05', 'epoch': '0.329', 'num_input_tokens_seen': 26750196, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6453', 'grad_norm': '1.232', 'learning_rate': '4.973e-05', 'epoch': '0.3291', 'num_input_tokens_seen': 26752243, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5889', 'grad_norm': '1.212', 'learning_rate': '4.973e-05', 'epoch': '0.3291', 'num_input_tokens_seen': 26754290, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6766', 'grad_norm': '1.384', 'learning_rate': '4.973e-05', 'epoch': '0.3291', 'num_input_tokens_seen': 26756337, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6709', 'grad_norm': '1.031', 'learning_rate': '4.973e-05', 'epoch': '0.3291', 'num_input_tokens_seen': 26758384, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4309', 'grad_norm': '0.9973', 'learning_rate': '4.973e-05', 'epoch': '0.3292', 'num_input_tokens_seen': 26760431, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4566', 'grad_norm': '1.319', 'learning_rate': '4.973e-05', 'epoch': '0.3292', 'num_input_tokens_seen': 26762478, 'train_runtime': '1.354e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5463', 'grad_norm': '1.421', 'learning_rate': '4.973e-05', 'epoch': '0.3292', 'num_input_tokens_seen': 26764525, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8335', 'grad_norm': '1.455', 'learning_rate': '4.973e-05', 'epoch': '0.3292', 'num_input_tokens_seen': 26766572, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6771', 'grad_norm': '1.314', 'learning_rate': '4.973e-05', 'epoch': '0.3293', 'num_input_tokens_seen': 26768619, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3866', 'grad_norm': '1.068', 'learning_rate': '4.973e-05', 'epoch': '0.3293', 'num_input_tokens_seen': 26770666, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2501', 'grad_norm': '0.8753', 'learning_rate': '4.973e-05', 'epoch': '0.3293', 'num_input_tokens_seen': 26772713, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.614', 'grad_norm': '0.9876', 'learning_rate': '4.973e-05', 'epoch': '0.3293', 'num_input_tokens_seen': 26774760, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1631', 'grad_norm': '0.7836', 'learning_rate': '4.973e-05', 'epoch': '0.3294', 'num_input_tokens_seen': 26776807, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.311', 'grad_norm': '1.889', 'learning_rate': '4.973e-05', 'epoch': '0.3294', 'num_input_tokens_seen': 26778854, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3708', 'grad_norm': '0.9633', 'learning_rate': '4.973e-05', 'epoch': '0.3294', 'num_input_tokens_seen': 26780901, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7497', 'grad_norm': '1.213', 'learning_rate': '4.973e-05', 'epoch': '0.3294', 'num_input_tokens_seen': 26782948, 'train_runtime': '1.355e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.489', 'grad_norm': '1.063', 'learning_rate': '4.973e-05', 'epoch': '0.3295', 'num_input_tokens_seen': 26784995, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7303', 'grad_norm': '1.444', 'learning_rate': '4.973e-05', 'epoch': '0.3295', 'num_input_tokens_seen': 26787042, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6881', 'grad_norm': '1.247', 'learning_rate': '4.973e-05', 'epoch': '0.3295', 'num_input_tokens_seen': 26789089, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.27', 'grad_norm': '2.374', 'learning_rate': '4.973e-05', 'epoch': '0.3295', 'num_input_tokens_seen': 26791136, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8507', 'grad_norm': '1.548', 'learning_rate': '4.973e-05', 'epoch': '0.3296', 'num_input_tokens_seen': 26793183, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.909', 'grad_norm': '1.574', 'learning_rate': '4.973e-05', 'epoch': '0.3296', 'num_input_tokens_seen': 26795230, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6736', 'grad_norm': '1.304', 'learning_rate': '4.973e-05', 'epoch': '0.3296', 'num_input_tokens_seen': 26797277, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.578', 'grad_norm': '2.209', 'learning_rate': '4.973e-05', 'epoch': '0.3296', 'num_input_tokens_seen': 26799324, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4218', 'grad_norm': '0.9992', 'learning_rate': '4.973e-05', 'epoch': '0.3297', 'num_input_tokens_seen': 26801371, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1548', 'grad_norm': '0.7281', 'learning_rate': '4.973e-05', 'epoch': '0.3297', 'num_input_tokens_seen': 26803418, 'train_runtime': '1.356e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3023', 'grad_norm': '0.9539', 'learning_rate': '4.973e-05', 'epoch': '0.3297', 'num_input_tokens_seen': 26805465, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8731', 'grad_norm': '1.764', 'learning_rate': '4.973e-05', 'epoch': '0.3297', 'num_input_tokens_seen': 26807512, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8729', 'grad_norm': '1.549', 'learning_rate': '4.973e-05', 'epoch': '0.3298', 'num_input_tokens_seen': 26809559, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5344', 'grad_norm': '1.023', 'learning_rate': '4.973e-05', 'epoch': '0.3298', 'num_input_tokens_seen': 26811606, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6958', 'grad_norm': '1.025', 'learning_rate': '4.973e-05', 'epoch': '0.3298', 'num_input_tokens_seen': 26813653, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.092', 'grad_norm': '3.105', 'learning_rate': '4.973e-05', 'epoch': '0.3298', 'num_input_tokens_seen': 26815700, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5155', 'grad_norm': '1.272', 'learning_rate': '4.973e-05', 'epoch': '0.3299', 'num_input_tokens_seen': 26817747, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7623', 'grad_norm': '1.552', 'learning_rate': '4.973e-05', 'epoch': '0.3299', 'num_input_tokens_seen': 26819794, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.201', 'grad_norm': '0.8525', 'learning_rate': '4.973e-05', 'epoch': '0.3299', 'num_input_tokens_seen': 26821841, 'train_runtime': '1.357e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6118', 'grad_norm': '0.899', 'learning_rate': '4.973e-05', 'epoch': '0.3299', 'num_input_tokens_seen': 26823888, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6523', 'grad_norm': '1.107', 'learning_rate': '4.973e-05', 'epoch': '0.33', 'num_input_tokens_seen': 26825935, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8704', 'grad_norm': '1.381', 'learning_rate': '4.973e-05', 'epoch': '0.33', 'num_input_tokens_seen': 26827982, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9629', 'grad_norm': '2.122', 'learning_rate': '4.973e-05', 'epoch': '0.33', 'num_input_tokens_seen': 26830029, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9948', 'grad_norm': '1.823', 'learning_rate': '4.973e-05', 'epoch': '0.33', 'num_input_tokens_seen': 26832076, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.048', 'grad_norm': '2.63', 'learning_rate': '4.973e-05', 'epoch': '0.3301', 'num_input_tokens_seen': 26834123, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.936', 'grad_norm': '1.66', 'learning_rate': '4.973e-05', 'epoch': '0.3301', 'num_input_tokens_seen': 26836170, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.336', 'grad_norm': '2.713', 'learning_rate': '4.973e-05', 'epoch': '0.3301', 'num_input_tokens_seen': 26838217, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8315', 'grad_norm': '1.251', 'learning_rate': '4.973e-05', 'epoch': '0.3301', 'num_input_tokens_seen': 26840264, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2373', 'grad_norm': '0.7845', 'learning_rate': '4.973e-05', 'epoch': '0.3302', 'num_input_tokens_seen': 26842311, 'train_runtime': '1.358e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3129', 'grad_norm': '0.8969', 'learning_rate': '4.973e-05', 'epoch': '0.3302', 'num_input_tokens_seen': 26844358, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7204', 'grad_norm': '1.221', 'learning_rate': '4.973e-05', 'epoch': '0.3302', 'num_input_tokens_seen': 26846405, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.42', 'grad_norm': '1.013', 'learning_rate': '4.973e-05', 'epoch': '0.3302', 'num_input_tokens_seen': 26848452, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.233', 'grad_norm': '2.258', 'learning_rate': '4.973e-05', 'epoch': '0.3303', 'num_input_tokens_seen': 26850499, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4273', 'grad_norm': '1.001', 'learning_rate': '4.973e-05', 'epoch': '0.3303', 'num_input_tokens_seen': 26852546, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.151', 'grad_norm': '1.691', 'learning_rate': '4.973e-05', 'epoch': '0.3303', 'num_input_tokens_seen': 26854593, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.056', 'grad_norm': '1.319', 'learning_rate': '4.973e-05', 'epoch': '0.3303', 'num_input_tokens_seen': 26856640, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5917', 'grad_norm': '1.113', 'learning_rate': '4.973e-05', 'epoch': '0.3304', 'num_input_tokens_seen': 26858687, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.159', 'grad_norm': '2.058', 'learning_rate': '4.973e-05', 'epoch': '0.3304', 'num_input_tokens_seen': 26860734, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4336', 'grad_norm': '1.07', 'learning_rate': '4.973e-05', 'epoch': '0.3304', 'num_input_tokens_seen': 26862781, 'train_runtime': '1.359e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8543', 'grad_norm': '1.203', 'learning_rate': '4.973e-05', 'epoch': '0.3304', 'num_input_tokens_seen': 26864828, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2998', 'grad_norm': '0.8527', 'learning_rate': '4.973e-05', 'epoch': '0.3305', 'num_input_tokens_seen': 26866875, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3324', 'grad_norm': '0.9783', 'learning_rate': '4.973e-05', 'epoch': '0.3305', 'num_input_tokens_seen': 26868922, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4859', 'grad_norm': '1.142', 'learning_rate': '4.973e-05', 'epoch': '0.3305', 'num_input_tokens_seen': 26870969, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3516', 'grad_norm': '0.7782', 'learning_rate': '4.973e-05', 'epoch': '0.3305', 'num_input_tokens_seen': 26873016, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8614', 'grad_norm': '1.372', 'learning_rate': '4.973e-05', 'epoch': '0.3306', 'num_input_tokens_seen': 26875063, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2977', 'grad_norm': '0.9706', 'learning_rate': '4.973e-05', 'epoch': '0.3306', 'num_input_tokens_seen': 26877110, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8347', 'grad_norm': '2.089', 'learning_rate': '4.973e-05', 'epoch': '0.3306', 'num_input_tokens_seen': 26879157, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.498', 'grad_norm': '1.349', 'learning_rate': '4.973e-05', 'epoch': '0.3306', 'num_input_tokens_seen': 26881204, 'train_runtime': '1.36e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6763', 'grad_norm': '1.698', 'learning_rate': '4.973e-05', 'epoch': '0.3307', 'num_input_tokens_seen': 26883251, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3975', 'grad_norm': '0.917', 'learning_rate': '4.973e-05', 'epoch': '0.3307', 'num_input_tokens_seen': 26885298, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2428', 'grad_norm': '0.8237', 'learning_rate': '4.973e-05', 'epoch': '0.3307', 'num_input_tokens_seen': 26887345, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4803', 'grad_norm': '1.026', 'learning_rate': '4.973e-05', 'epoch': '0.3307', 'num_input_tokens_seen': 26889392, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.387', 'grad_norm': '1.013', 'learning_rate': '4.973e-05', 'epoch': '0.3308', 'num_input_tokens_seen': 26891439, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.806', 'grad_norm': '3.412', 'learning_rate': '4.973e-05', 'epoch': '0.3308', 'num_input_tokens_seen': 26893486, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3194', 'grad_norm': '0.8285', 'learning_rate': '4.973e-05', 'epoch': '0.3308', 'num_input_tokens_seen': 26895533, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6503', 'grad_norm': '1.338', 'learning_rate': '4.973e-05', 'epoch': '0.3308', 'num_input_tokens_seen': 26897580, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5158', 'grad_norm': '1.032', 'learning_rate': '4.973e-05', 'epoch': '0.3309', 'num_input_tokens_seen': 26899627, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4376', 'grad_norm': '0.9117', 'learning_rate': '4.973e-05', 'epoch': '0.3309', 'num_input_tokens_seen': 26901674, 'train_runtime': '1.361e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.57', 'grad_norm': '2.62', 'learning_rate': '4.973e-05', 'epoch': '0.3309', 'num_input_tokens_seen': 26903721, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.243', 'grad_norm': '0.9171', 'learning_rate': '4.973e-05', 'epoch': '0.3309', 'num_input_tokens_seen': 26905768, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.3', 'grad_norm': '1.732', 'learning_rate': '4.973e-05', 'epoch': '0.331', 'num_input_tokens_seen': 26907815, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6859', 'grad_norm': '1.494', 'learning_rate': '4.973e-05', 'epoch': '0.331', 'num_input_tokens_seen': 26909862, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8202', 'grad_norm': '1.224', 'learning_rate': '4.973e-05', 'epoch': '0.331', 'num_input_tokens_seen': 26911909, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3343', 'grad_norm': '0.9289', 'learning_rate': '4.973e-05', 'epoch': '0.331', 'num_input_tokens_seen': 26913956, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6657', 'grad_norm': '0.9154', 'learning_rate': '4.973e-05', 'epoch': '0.3311', 'num_input_tokens_seen': 26916003, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9781', 'grad_norm': '1.456', 'learning_rate': '4.973e-05', 'epoch': '0.3311', 'num_input_tokens_seen': 26918050, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6171', 'grad_norm': '1.111', 'learning_rate': '4.973e-05', 'epoch': '0.3311', 'num_input_tokens_seen': 26920097, 'train_runtime': '1.362e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2743', 'grad_norm': '0.8241', 'learning_rate': '4.973e-05', 'epoch': '0.3311', 'num_input_tokens_seen': 26922144, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.338', 'grad_norm': '0.832', 'learning_rate': '4.973e-05', 'epoch': '0.3312', 'num_input_tokens_seen': 26924191, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2339', 'grad_norm': '0.8032', 'learning_rate': '4.973e-05', 'epoch': '0.3312', 'num_input_tokens_seen': 26926238, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8654', 'grad_norm': '1.468', 'learning_rate': '4.973e-05', 'epoch': '0.3312', 'num_input_tokens_seen': 26928285, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.573', 'grad_norm': '2.371', 'learning_rate': '4.973e-05', 'epoch': '0.3312', 'num_input_tokens_seen': 26930332, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4881', 'grad_norm': '1.251', 'learning_rate': '4.973e-05', 'epoch': '0.3313', 'num_input_tokens_seen': 26932379, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3929', 'grad_norm': '0.8617', 'learning_rate': '4.973e-05', 'epoch': '0.3313', 'num_input_tokens_seen': 26934426, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3947', 'grad_norm': '1.014', 'learning_rate': '4.973e-05', 'epoch': '0.3313', 'num_input_tokens_seen': 26936473, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3267', 'grad_norm': '1.044', 'learning_rate': '4.973e-05', 'epoch': '0.3313', 'num_input_tokens_seen': 26938520, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.438', 'grad_norm': '2.788', 'learning_rate': '4.973e-05', 'epoch': '0.3314', 'num_input_tokens_seen': 26940567, 'train_runtime': '1.363e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7683', 'grad_norm': '1.347', 'learning_rate': '4.973e-05', 'epoch': '0.3314', 'num_input_tokens_seen': 26942614, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3007', 'grad_norm': '0.8928', 'learning_rate': '4.973e-05', 'epoch': '0.3314', 'num_input_tokens_seen': 26944661, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5536', 'grad_norm': '1.17', 'learning_rate': '4.973e-05', 'epoch': '0.3314', 'num_input_tokens_seen': 26946708, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.294', 'grad_norm': '0.8023', 'learning_rate': '4.973e-05', 'epoch': '0.3315', 'num_input_tokens_seen': 26948755, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4962', 'grad_norm': '1.108', 'learning_rate': '4.973e-05', 'epoch': '0.3315', 'num_input_tokens_seen': 26950802, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.56', 'grad_norm': '0.9836', 'learning_rate': '4.973e-05', 'epoch': '0.3315', 'num_input_tokens_seen': 26952849, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '2.046', 'learning_rate': '4.973e-05', 'epoch': '0.3315', 'num_input_tokens_seen': 26954896, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.11', 'grad_norm': '1.821', 'learning_rate': '4.973e-05', 'epoch': '0.3316', 'num_input_tokens_seen': 26956943, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.049', 'grad_norm': '1.828', 'learning_rate': '4.972e-05', 'epoch': '0.3316', 'num_input_tokens_seen': 26958990, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7731', 'grad_norm': '1.406', 'learning_rate': '4.972e-05', 'epoch': '0.3316', 'num_input_tokens_seen': 26961037, 'train_runtime': '1.364e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.272', 'grad_norm': '2.365', 'learning_rate': '4.972e-05', 'epoch': '0.3316', 'num_input_tokens_seen': 26963084, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.274', 'grad_norm': '0.869', 'learning_rate': '4.972e-05', 'epoch': '0.3317', 'num_input_tokens_seen': 26965131, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2631', 'grad_norm': '0.8104', 'learning_rate': '4.972e-05', 'epoch': '0.3317', 'num_input_tokens_seen': 26967178, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.118', 'grad_norm': '1.917', 'learning_rate': '4.972e-05', 'epoch': '0.3317', 'num_input_tokens_seen': 26969225, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.17', 'grad_norm': '2.319', 'learning_rate': '4.972e-05', 'epoch': '0.3317', 'num_input_tokens_seen': 26971272, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7546', 'grad_norm': '1.383', 'learning_rate': '4.972e-05', 'epoch': '0.3318', 'num_input_tokens_seen': 26973319, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2706', 'grad_norm': '0.8461', 'learning_rate': '4.972e-05', 'epoch': '0.3318', 'num_input_tokens_seen': 26975366, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2852', 'grad_norm': '0.8013', 'learning_rate': '4.972e-05', 'epoch': '0.3318', 'num_input_tokens_seen': 26977413, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2282', 'grad_norm': '0.8314', 'learning_rate': '4.972e-05', 'epoch': '0.3318', 'num_input_tokens_seen': 26979460, 'train_runtime': '1.365e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5759', 'grad_norm': '1.232', 'learning_rate': '4.972e-05', 'epoch': '0.3319', 'num_input_tokens_seen': 26981507, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4133', 'grad_norm': '1.443', 'learning_rate': '4.972e-05', 'epoch': '0.3319', 'num_input_tokens_seen': 26983554, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2081', 'grad_norm': '0.8204', 'learning_rate': '4.972e-05', 'epoch': '0.3319', 'num_input_tokens_seen': 26985601, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.369', 'grad_norm': '0.8151', 'learning_rate': '4.972e-05', 'epoch': '0.3319', 'num_input_tokens_seen': 26987648, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4851', 'grad_norm': '1.45', 'learning_rate': '4.972e-05', 'epoch': '0.332', 'num_input_tokens_seen': 26989695, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.297', 'grad_norm': '2.008', 'learning_rate': '4.972e-05', 'epoch': '0.332', 'num_input_tokens_seen': 26991742, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2966', 'grad_norm': '0.9512', 'learning_rate': '4.972e-05', 'epoch': '0.332', 'num_input_tokens_seen': 26993789, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6164', 'grad_norm': '1.337', 'learning_rate': '4.972e-05', 'epoch': '0.332', 'num_input_tokens_seen': 26995836, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8988', 'grad_norm': '1.813', 'learning_rate': '4.972e-05', 'epoch': '0.3321', 'num_input_tokens_seen': 26997883, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2738', 'grad_norm': '1.07', 'learning_rate': '4.972e-05', 'epoch': '0.3321', 'num_input_tokens_seen': 26999930, 'train_runtime': '1.366e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7974', 'grad_norm': '1.375', 'learning_rate': '4.972e-05', 'epoch': '0.3321', 'num_input_tokens_seen': 27001977, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5529', 'grad_norm': '1.101', 'learning_rate': '4.972e-05', 'epoch': '0.3321', 'num_input_tokens_seen': 27004024, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3389', 'grad_norm': '1.011', 'learning_rate': '4.972e-05', 'epoch': '0.3322', 'num_input_tokens_seen': 27006071, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2668', 'grad_norm': '0.8304', 'learning_rate': '4.972e-05', 'epoch': '0.3322', 'num_input_tokens_seen': 27008118, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4596', 'grad_norm': '1.303', 'learning_rate': '4.972e-05', 'epoch': '0.3322', 'num_input_tokens_seen': 27010165, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4831', 'grad_norm': '1.249', 'learning_rate': '4.972e-05', 'epoch': '0.3323', 'num_input_tokens_seen': 27012212, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5063', 'grad_norm': '1.015', 'learning_rate': '4.972e-05', 'epoch': '0.3323', 'num_input_tokens_seen': 27014259, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8053', 'grad_norm': '1.42', 'learning_rate': '4.972e-05', 'epoch': '0.3323', 'num_input_tokens_seen': 27016306, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2707', 'grad_norm': '1.001', 'learning_rate': '4.972e-05', 'epoch': '0.3323', 'num_input_tokens_seen': 27018353, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3049', 'grad_norm': '0.9786', 'learning_rate': '4.972e-05', 'epoch': '0.3324', 'num_input_tokens_seen': 27020400, 'train_runtime': '1.367e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.39', 'grad_norm': '2.02', 'learning_rate': '4.972e-05', 'epoch': '0.3324', 'num_input_tokens_seen': 27022447, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.225', 'grad_norm': '1.691', 'learning_rate': '4.972e-05', 'epoch': '0.3324', 'num_input_tokens_seen': 27024494, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.219', 'grad_norm': '2.235', 'learning_rate': '4.972e-05', 'epoch': '0.3324', 'num_input_tokens_seen': 27026541, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.009', 'grad_norm': '2.279', 'learning_rate': '4.972e-05', 'epoch': '0.3325', 'num_input_tokens_seen': 27028588, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3165', 'grad_norm': '0.9339', 'learning_rate': '4.972e-05', 'epoch': '0.3325', 'num_input_tokens_seen': 27030635, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.143', 'grad_norm': '1.756', 'learning_rate': '4.972e-05', 'epoch': '0.3325', 'num_input_tokens_seen': 27032682, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1742', 'grad_norm': '0.8947', 'learning_rate': '4.972e-05', 'epoch': '0.3325', 'num_input_tokens_seen': 27034729, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6861', 'grad_norm': '1.192', 'learning_rate': '4.972e-05', 'epoch': '0.3326', 'num_input_tokens_seen': 27036776, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7664', 'grad_norm': '1.079', 'learning_rate': '4.972e-05', 'epoch': '0.3326', 'num_input_tokens_seen': 27038823, 'train_runtime': '1.368e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2508', 'grad_norm': '0.9398', 'learning_rate': '4.972e-05', 'epoch': '0.3326', 'num_input_tokens_seen': 27040870, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9942', 'grad_norm': '1.402', 'learning_rate': '4.972e-05', 'epoch': '0.3326', 'num_input_tokens_seen': 27042917, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5158', 'grad_norm': '1.604', 'learning_rate': '4.972e-05', 'epoch': '0.3327', 'num_input_tokens_seen': 27044964, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.498', 'grad_norm': '2.786', 'learning_rate': '4.972e-05', 'epoch': '0.3327', 'num_input_tokens_seen': 27047011, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.012', 'grad_norm': '1.954', 'learning_rate': '4.972e-05', 'epoch': '0.3327', 'num_input_tokens_seen': 27049058, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5736', 'grad_norm': '1.119', 'learning_rate': '4.972e-05', 'epoch': '0.3327', 'num_input_tokens_seen': 27051105, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6193', 'grad_norm': '1.89', 'learning_rate': '4.972e-05', 'epoch': '0.3328', 'num_input_tokens_seen': 27053152, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.536', 'grad_norm': '1.239', 'learning_rate': '4.972e-05', 'epoch': '0.3328', 'num_input_tokens_seen': 27055199, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6206', 'grad_norm': '1.514', 'learning_rate': '4.972e-05', 'epoch': '0.3328', 'num_input_tokens_seen': 27057246, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2935', 'grad_norm': '0.8508', 'learning_rate': '4.972e-05', 'epoch': '0.3328', 'num_input_tokens_seen': 27059293, 'train_runtime': '1.369e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5108', 'grad_norm': '1.352', 'learning_rate': '4.972e-05', 'epoch': '0.3329', 'num_input_tokens_seen': 27061340, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.659', 'grad_norm': '2.715', 'learning_rate': '4.972e-05', 'epoch': '0.3329', 'num_input_tokens_seen': 27063387, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.219', 'grad_norm': '1.978', 'learning_rate': '4.972e-05', 'epoch': '0.3329', 'num_input_tokens_seen': 27065434, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8881', 'grad_norm': '1.27', 'learning_rate': '4.972e-05', 'epoch': '0.3329', 'num_input_tokens_seen': 27067481, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2771', 'grad_norm': '0.8812', 'learning_rate': '4.972e-05', 'epoch': '0.333', 'num_input_tokens_seen': 27069528, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.451', 'grad_norm': '2.785', 'learning_rate': '4.972e-05', 'epoch': '0.333', 'num_input_tokens_seen': 27071575, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6309', 'grad_norm': '1.294', 'learning_rate': '4.972e-05', 'epoch': '0.333', 'num_input_tokens_seen': 27073622, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.621', 'grad_norm': '1.367', 'learning_rate': '4.972e-05', 'epoch': '0.333', 'num_input_tokens_seen': 27075669, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.371', 'grad_norm': '1.997', 'learning_rate': '4.972e-05', 'epoch': '0.3331', 'num_input_tokens_seen': 27077716, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.758', 'grad_norm': '1.289', 'learning_rate': '4.972e-05', 'epoch': '0.3331', 'num_input_tokens_seen': 27079763, 'train_runtime': '1.37e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7407', 'grad_norm': '1.249', 'learning_rate': '4.972e-05', 'epoch': '0.3331', 'num_input_tokens_seen': 27081810, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.247', 'grad_norm': '2.014', 'learning_rate': '4.972e-05', 'epoch': '0.3331', 'num_input_tokens_seen': 27083857, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3896', 'grad_norm': '1.151', 'learning_rate': '4.972e-05', 'epoch': '0.3332', 'num_input_tokens_seen': 27085904, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6137', 'grad_norm': '1.855', 'learning_rate': '4.972e-05', 'epoch': '0.3332', 'num_input_tokens_seen': 27087951, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.578', 'grad_norm': '2.266', 'learning_rate': '4.972e-05', 'epoch': '0.3332', 'num_input_tokens_seen': 27089998, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3163', 'grad_norm': '0.9082', 'learning_rate': '4.972e-05', 'epoch': '0.3332', 'num_input_tokens_seen': 27092045, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4426', 'grad_norm': '0.8093', 'learning_rate': '4.972e-05', 'epoch': '0.3333', 'num_input_tokens_seen': 27094092, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4348', 'grad_norm': '0.964', 'learning_rate': '4.972e-05', 'epoch': '0.3333', 'num_input_tokens_seen': 27096139, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3041', 'grad_norm': '0.7458', 'learning_rate': '4.972e-05', 'epoch': '0.3333', 'num_input_tokens_seen': 27098186, 'train_runtime': '1.371e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8873', 'grad_norm': '1.993', 'learning_rate': '4.972e-05', 'epoch': '0.3333', 'num_input_tokens_seen': 27100233, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6054', 'grad_norm': '1.343', 'learning_rate': '4.972e-05', 'epoch': '0.3334', 'num_input_tokens_seen': 27102280, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7099', 'grad_norm': '1.278', 'learning_rate': '4.972e-05', 'epoch': '0.3334', 'num_input_tokens_seen': 27104327, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2456', 'grad_norm': '0.7732', 'learning_rate': '4.972e-05', 'epoch': '0.3334', 'num_input_tokens_seen': 27106374, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.018', 'grad_norm': '1.401', 'learning_rate': '4.972e-05', 'epoch': '0.3334', 'num_input_tokens_seen': 27108421, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.767', 'grad_norm': '2.428', 'learning_rate': '4.972e-05', 'epoch': '0.3335', 'num_input_tokens_seen': 27110468, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3745', 'grad_norm': '0.839', 'learning_rate': '4.972e-05', 'epoch': '0.3335', 'num_input_tokens_seen': 27112515, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.224', 'grad_norm': '1.73', 'learning_rate': '4.972e-05', 'epoch': '0.3335', 'num_input_tokens_seen': 27114562, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.266', 'grad_norm': '0.8899', 'learning_rate': '4.972e-05', 'epoch': '0.3335', 'num_input_tokens_seen': 27116609, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4434', 'grad_norm': '0.8667', 'learning_rate': '4.972e-05', 'epoch': '0.3336', 'num_input_tokens_seen': 27118656, 'train_runtime': '1.372e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4569', 'grad_norm': '0.8952', 'learning_rate': '4.972e-05', 'epoch': '0.3336', 'num_input_tokens_seen': 27120703, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.469', 'grad_norm': '1.184', 'learning_rate': '4.972e-05', 'epoch': '0.3336', 'num_input_tokens_seen': 27122750, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9541', 'grad_norm': '1.101', 'learning_rate': '4.972e-05', 'epoch': '0.3336', 'num_input_tokens_seen': 27124797, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2572', 'grad_norm': '0.7824', 'learning_rate': '4.972e-05', 'epoch': '0.3337', 'num_input_tokens_seen': 27126844, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.317', 'grad_norm': '1.806', 'learning_rate': '4.972e-05', 'epoch': '0.3337', 'num_input_tokens_seen': 27128891, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4245', 'grad_norm': '0.9687', 'learning_rate': '4.972e-05', 'epoch': '0.3337', 'num_input_tokens_seen': 27130938, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7972', 'grad_norm': '1.422', 'learning_rate': '4.972e-05', 'epoch': '0.3337', 'num_input_tokens_seen': 27132985, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6212', 'grad_norm': '1.099', 'learning_rate': '4.972e-05', 'epoch': '0.3338', 'num_input_tokens_seen': 27135032, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2563', 'grad_norm': '0.9094', 'learning_rate': '4.972e-05', 'epoch': '0.3338', 'num_input_tokens_seen': 27137079, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3295', 'grad_norm': '0.7993', 'learning_rate': '4.972e-05', 'epoch': '0.3338', 'num_input_tokens_seen': 27139126, 'train_runtime': '1.373e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5439', 'grad_norm': '1.057', 'learning_rate': '4.972e-05', 'epoch': '0.3338', 'num_input_tokens_seen': 27141173, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3258', 'grad_norm': '0.965', 'learning_rate': '4.972e-05', 'epoch': '0.3339', 'num_input_tokens_seen': 27143220, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6963', 'grad_norm': '1.197', 'learning_rate': '4.972e-05', 'epoch': '0.3339', 'num_input_tokens_seen': 27145267, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.253', 'grad_norm': '1.917', 'learning_rate': '4.972e-05', 'epoch': '0.3339', 'num_input_tokens_seen': 27147314, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8691', 'grad_norm': '1.899', 'learning_rate': '4.972e-05', 'epoch': '0.3339', 'num_input_tokens_seen': 27149361, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4706', 'grad_norm': '1.01', 'learning_rate': '4.972e-05', 'epoch': '0.334', 'num_input_tokens_seen': 27151408, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9171', 'grad_norm': '1.048', 'learning_rate': '4.972e-05', 'epoch': '0.334', 'num_input_tokens_seen': 27153455, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3349', 'grad_norm': '0.8858', 'learning_rate': '4.972e-05', 'epoch': '0.334', 'num_input_tokens_seen': 27155502, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.176', 'grad_norm': '1.742', 'learning_rate': '4.972e-05', 'epoch': '0.334', 'num_input_tokens_seen': 27157549, 'train_runtime': '1.374e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2206', 'grad_norm': '0.7395', 'learning_rate': '4.972e-05', 'epoch': '0.3341', 'num_input_tokens_seen': 27159596, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5493', 'grad_norm': '1.758', 'learning_rate': '4.972e-05', 'epoch': '0.3341', 'num_input_tokens_seen': 27161643, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6224', 'grad_norm': '1.302', 'learning_rate': '4.972e-05', 'epoch': '0.3341', 'num_input_tokens_seen': 27163690, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6197', 'grad_norm': '1.279', 'learning_rate': '4.972e-05', 'epoch': '0.3341', 'num_input_tokens_seen': 27165737, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6432', 'grad_norm': '1.41', 'learning_rate': '4.972e-05', 'epoch': '0.3342', 'num_input_tokens_seen': 27167784, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.11', 'grad_norm': '1.813', 'learning_rate': '4.972e-05', 'epoch': '0.3342', 'num_input_tokens_seen': 27169831, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3032', 'grad_norm': '0.8101', 'learning_rate': '4.972e-05', 'epoch': '0.3342', 'num_input_tokens_seen': 27171878, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8483', 'grad_norm': '2.07', 'learning_rate': '4.972e-05', 'epoch': '0.3342', 'num_input_tokens_seen': 27173925, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3515', 'grad_norm': '0.9234', 'learning_rate': '4.972e-05', 'epoch': '0.3343', 'num_input_tokens_seen': 27175972, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.296', 'grad_norm': '1.015', 'learning_rate': '4.972e-05', 'epoch': '0.3343', 'num_input_tokens_seen': 27178019, 'train_runtime': '1.375e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6124', 'grad_norm': '0.982', 'learning_rate': '4.972e-05', 'epoch': '0.3343', 'num_input_tokens_seen': 27180066, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.478', 'grad_norm': '2.729', 'learning_rate': '4.972e-05', 'epoch': '0.3343', 'num_input_tokens_seen': 27182113, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5073', 'grad_norm': '1.139', 'learning_rate': '4.972e-05', 'epoch': '0.3344', 'num_input_tokens_seen': 27184160, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.35', 'grad_norm': '0.8674', 'learning_rate': '4.972e-05', 'epoch': '0.3344', 'num_input_tokens_seen': 27186207, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.055', 'grad_norm': '1.255', 'learning_rate': '4.972e-05', 'epoch': '0.3344', 'num_input_tokens_seen': 27188254, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.662', 'grad_norm': '1.332', 'learning_rate': '4.972e-05', 'epoch': '0.3344', 'num_input_tokens_seen': 27190301, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6328', 'grad_norm': '1.247', 'learning_rate': '4.972e-05', 'epoch': '0.3345', 'num_input_tokens_seen': 27192348, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6997', 'grad_norm': '1.238', 'learning_rate': '4.972e-05', 'epoch': '0.3345', 'num_input_tokens_seen': 27194395, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3501', 'grad_norm': '1.639', 'learning_rate': '4.972e-05', 'epoch': '0.3345', 'num_input_tokens_seen': 27196442, 'train_runtime': '1.376e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.297', 'grad_norm': '0.8685', 'learning_rate': '4.972e-05', 'epoch': '0.3345', 'num_input_tokens_seen': 27198489, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7399', 'grad_norm': '1.094', 'learning_rate': '4.972e-05', 'epoch': '0.3346', 'num_input_tokens_seen': 27200536, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3277', 'grad_norm': '0.8691', 'learning_rate': '4.972e-05', 'epoch': '0.3346', 'num_input_tokens_seen': 27202583, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5096', 'grad_norm': '0.8662', 'learning_rate': '4.972e-05', 'epoch': '0.3346', 'num_input_tokens_seen': 27204630, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5501', 'grad_norm': '1.501', 'learning_rate': '4.972e-05', 'epoch': '0.3346', 'num_input_tokens_seen': 27206677, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.208', 'grad_norm': '2.811', 'learning_rate': '4.972e-05', 'epoch': '0.3347', 'num_input_tokens_seen': 27208724, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3276', 'grad_norm': '0.9638', 'learning_rate': '4.972e-05', 'epoch': '0.3347', 'num_input_tokens_seen': 27210771, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9621', 'grad_norm': '1.575', 'learning_rate': '4.972e-05', 'epoch': '0.3347', 'num_input_tokens_seen': 27212818, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.503', 'grad_norm': '2.278', 'learning_rate': '4.972e-05', 'epoch': '0.3347', 'num_input_tokens_seen': 27214865, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.354', 'grad_norm': '0.8762', 'learning_rate': '4.972e-05', 'epoch': '0.3348', 'num_input_tokens_seen': 27216912, 'train_runtime': '1.377e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3404', 'grad_norm': '0.8009', 'learning_rate': '4.972e-05', 'epoch': '0.3348', 'num_input_tokens_seen': 27218959, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.015', 'grad_norm': '2.108', 'learning_rate': '4.972e-05', 'epoch': '0.3348', 'num_input_tokens_seen': 27221006, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4235', 'grad_norm': '1.153', 'learning_rate': '4.972e-05', 'epoch': '0.3348', 'num_input_tokens_seen': 27223053, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.746', 'grad_norm': '2.78', 'learning_rate': '4.972e-05', 'epoch': '0.3349', 'num_input_tokens_seen': 27225100, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3206', 'grad_norm': '0.7688', 'learning_rate': '4.972e-05', 'epoch': '0.3349', 'num_input_tokens_seen': 27227147, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4964', 'grad_norm': '1.286', 'learning_rate': '4.972e-05', 'epoch': '0.3349', 'num_input_tokens_seen': 27229194, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5035', 'grad_norm': '1.176', 'learning_rate': '4.972e-05', 'epoch': '0.3349', 'num_input_tokens_seen': 27231241, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3373', 'grad_norm': '0.8743', 'learning_rate': '4.972e-05', 'epoch': '0.335', 'num_input_tokens_seen': 27233288, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7397', 'grad_norm': '1.263', 'learning_rate': '4.972e-05', 'epoch': '0.335', 'num_input_tokens_seen': 27235335, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9071', 'grad_norm': '1.27', 'learning_rate': '4.972e-05', 'epoch': '0.335', 'num_input_tokens_seen': 27237382, 'train_runtime': '1.378e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5906', 'grad_norm': '1.098', 'learning_rate': '4.972e-05', 'epoch': '0.335', 'num_input_tokens_seen': 27239429, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1941', 'grad_norm': '0.7961', 'learning_rate': '4.972e-05', 'epoch': '0.3351', 'num_input_tokens_seen': 27241476, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2768', 'grad_norm': '0.7872', 'learning_rate': '4.972e-05', 'epoch': '0.3351', 'num_input_tokens_seen': 27243523, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4342', 'grad_norm': '1.256', 'learning_rate': '4.972e-05', 'epoch': '0.3351', 'num_input_tokens_seen': 27245570, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7998', 'grad_norm': '1.287', 'learning_rate': '4.972e-05', 'epoch': '0.3351', 'num_input_tokens_seen': 27247617, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7074', 'grad_norm': '1.317', 'learning_rate': '4.972e-05', 'epoch': '0.3352', 'num_input_tokens_seen': 27249664, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4501', 'grad_norm': '0.9199', 'learning_rate': '4.972e-05', 'epoch': '0.3352', 'num_input_tokens_seen': 27251711, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.206', 'grad_norm': '1.164', 'learning_rate': '4.972e-05', 'epoch': '0.3352', 'num_input_tokens_seen': 27253758, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.012', 'grad_norm': '1.956', 'learning_rate': '4.972e-05', 'epoch': '0.3352', 'num_input_tokens_seen': 27255805, 'train_runtime': '1.379e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9845', 'grad_norm': '1.821', 'learning_rate': '4.972e-05', 'epoch': '0.3353', 'num_input_tokens_seen': 27257852, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5051', 'grad_norm': '0.996', 'learning_rate': '4.972e-05', 'epoch': '0.3353', 'num_input_tokens_seen': 27259899, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.615', 'grad_norm': '2.598', 'learning_rate': '4.972e-05', 'epoch': '0.3353', 'num_input_tokens_seen': 27261946, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.062', 'grad_norm': '1.781', 'learning_rate': '4.972e-05', 'epoch': '0.3353', 'num_input_tokens_seen': 27263993, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7571', 'grad_norm': '0.9411', 'learning_rate': '4.972e-05', 'epoch': '0.3354', 'num_input_tokens_seen': 27266040, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.05', 'grad_norm': '1.662', 'learning_rate': '4.972e-05', 'epoch': '0.3354', 'num_input_tokens_seen': 27268087, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6615', 'grad_norm': '1.154', 'learning_rate': '4.972e-05', 'epoch': '0.3354', 'num_input_tokens_seen': 27270134, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3974', 'grad_norm': '0.9968', 'learning_rate': '4.972e-05', 'epoch': '0.3354', 'num_input_tokens_seen': 27272181, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3198', 'grad_norm': '0.8725', 'learning_rate': '4.972e-05', 'epoch': '0.3355', 'num_input_tokens_seen': 27274228, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2361', 'grad_norm': '0.7424', 'learning_rate': '4.972e-05', 'epoch': '0.3355', 'num_input_tokens_seen': 27276275, 'train_runtime': '1.38e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3033', 'grad_norm': '0.8473', 'learning_rate': '4.972e-05', 'epoch': '0.3355', 'num_input_tokens_seen': 27278322, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.439', 'grad_norm': '2.378', 'learning_rate': '4.972e-05', 'epoch': '0.3355', 'num_input_tokens_seen': 27280369, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6546', 'grad_norm': '1.275', 'learning_rate': '4.972e-05', 'epoch': '0.3356', 'num_input_tokens_seen': 27282416, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.415', 'grad_norm': '1.452', 'learning_rate': '4.972e-05', 'epoch': '0.3356', 'num_input_tokens_seen': 27284463, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2584', 'grad_norm': '0.9021', 'learning_rate': '4.972e-05', 'epoch': '0.3356', 'num_input_tokens_seen': 27286510, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5007', 'grad_norm': '1.212', 'learning_rate': '4.972e-05', 'epoch': '0.3356', 'num_input_tokens_seen': 27288557, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4627', 'grad_norm': '1.075', 'learning_rate': '4.972e-05', 'epoch': '0.3357', 'num_input_tokens_seen': 27290604, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8804', 'grad_norm': '1.394', 'learning_rate': '4.972e-05', 'epoch': '0.3357', 'num_input_tokens_seen': 27292651, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2145', 'grad_norm': '0.7543', 'learning_rate': '4.972e-05', 'epoch': '0.3357', 'num_input_tokens_seen': 27294698, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3932', 'grad_norm': '1.03', 'learning_rate': '4.972e-05', 'epoch': '0.3358', 'num_input_tokens_seen': 27296745, 'train_runtime': '1.381e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3471', 'grad_norm': '1.013', 'learning_rate': '4.971e-05', 'epoch': '0.3358', 'num_input_tokens_seen': 27298792, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3402', 'grad_norm': '0.8515', 'learning_rate': '4.971e-05', 'epoch': '0.3358', 'num_input_tokens_seen': 27300839, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9459', 'grad_norm': '1.572', 'learning_rate': '4.971e-05', 'epoch': '0.3358', 'num_input_tokens_seen': 27302886, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2714', 'grad_norm': '0.9292', 'learning_rate': '4.971e-05', 'epoch': '0.3359', 'num_input_tokens_seen': 27304933, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4998', 'grad_norm': '1.414', 'learning_rate': '4.971e-05', 'epoch': '0.3359', 'num_input_tokens_seen': 27306980, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4754', 'grad_norm': '1.057', 'learning_rate': '4.971e-05', 'epoch': '0.3359', 'num_input_tokens_seen': 27309027, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1846', 'grad_norm': '0.7965', 'learning_rate': '4.971e-05', 'epoch': '0.3359', 'num_input_tokens_seen': 27311074, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5698', 'grad_norm': '1.249', 'learning_rate': '4.971e-05', 'epoch': '0.336', 'num_input_tokens_seen': 27313121, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4684', 'grad_norm': '0.9677', 'learning_rate': '4.971e-05', 'epoch': '0.336', 'num_input_tokens_seen': 27315168, 'train_runtime': '1.382e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4322', 'grad_norm': '1.126', 'learning_rate': '4.971e-05', 'epoch': '0.336', 'num_input_tokens_seen': 27317215, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4489', 'grad_norm': '0.9721', 'learning_rate': '4.971e-05', 'epoch': '0.336', 'num_input_tokens_seen': 27319262, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.134', 'grad_norm': '2.052', 'learning_rate': '4.971e-05', 'epoch': '0.3361', 'num_input_tokens_seen': 27321309, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8355', 'grad_norm': '2.1', 'learning_rate': '4.971e-05', 'epoch': '0.3361', 'num_input_tokens_seen': 27323356, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2175', 'grad_norm': '0.8337', 'learning_rate': '4.971e-05', 'epoch': '0.3361', 'num_input_tokens_seen': 27325403, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9171', 'grad_norm': '1.838', 'learning_rate': '4.971e-05', 'epoch': '0.3361', 'num_input_tokens_seen': 27327450, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.355', 'grad_norm': '0.8843', 'learning_rate': '4.971e-05', 'epoch': '0.3362', 'num_input_tokens_seen': 27329497, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5246', 'grad_norm': '1.005', 'learning_rate': '4.971e-05', 'epoch': '0.3362', 'num_input_tokens_seen': 27331544, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1375', 'grad_norm': '0.6556', 'learning_rate': '4.971e-05', 'epoch': '0.3362', 'num_input_tokens_seen': 27333591, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7345', 'grad_norm': '1.478', 'learning_rate': '4.971e-05', 'epoch': '0.3362', 'num_input_tokens_seen': 27335638, 'train_runtime': '1.383e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8607', 'grad_norm': '1.796', 'learning_rate': '4.971e-05', 'epoch': '0.3363', 'num_input_tokens_seen': 27337685, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.484', 'grad_norm': '1.292', 'learning_rate': '4.971e-05', 'epoch': '0.3363', 'num_input_tokens_seen': 27339732, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4569', 'grad_norm': '0.947', 'learning_rate': '4.971e-05', 'epoch': '0.3363', 'num_input_tokens_seen': 27341779, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.079', 'grad_norm': '1.903', 'learning_rate': '4.971e-05', 'epoch': '0.3363', 'num_input_tokens_seen': 27343826, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4938', 'grad_norm': '1.104', 'learning_rate': '4.971e-05', 'epoch': '0.3364', 'num_input_tokens_seen': 27345873, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.792', 'grad_norm': '1.199', 'learning_rate': '4.971e-05', 'epoch': '0.3364', 'num_input_tokens_seen': 27347920, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3728', 'grad_norm': '0.994', 'learning_rate': '4.971e-05', 'epoch': '0.3364', 'num_input_tokens_seen': 27349967, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.646', 'grad_norm': '2.944', 'learning_rate': '4.971e-05', 'epoch': '0.3364', 'num_input_tokens_seen': 27352014, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9822', 'grad_norm': '1.072', 'learning_rate': '4.971e-05', 'epoch': '0.3365', 'num_input_tokens_seen': 27354061, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4098', 'grad_norm': '0.9572', 'learning_rate': '4.971e-05', 'epoch': '0.3365', 'num_input_tokens_seen': 27356108, 'train_runtime': '1.384e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.276', 'grad_norm': '2.392', 'learning_rate': '4.971e-05', 'epoch': '0.3365', 'num_input_tokens_seen': 27358155, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9431', 'grad_norm': '1.425', 'learning_rate': '4.971e-05', 'epoch': '0.3365', 'num_input_tokens_seen': 27360202, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4797', 'grad_norm': '1.14', 'learning_rate': '4.971e-05', 'epoch': '0.3366', 'num_input_tokens_seen': 27362249, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.105', 'grad_norm': '2.269', 'learning_rate': '4.971e-05', 'epoch': '0.3366', 'num_input_tokens_seen': 27364296, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.904', 'grad_norm': '2.037', 'learning_rate': '4.971e-05', 'epoch': '0.3366', 'num_input_tokens_seen': 27366343, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8072', 'grad_norm': '1.049', 'learning_rate': '4.971e-05', 'epoch': '0.3366', 'num_input_tokens_seen': 27368390, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.722', 'grad_norm': '2.853', 'learning_rate': '4.971e-05', 'epoch': '0.3367', 'num_input_tokens_seen': 27370437, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5173', 'grad_norm': '1.196', 'learning_rate': '4.971e-05', 'epoch': '0.3367', 'num_input_tokens_seen': 27372484, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4098', 'grad_norm': '1.07', 'learning_rate': '4.971e-05', 'epoch': '0.3367', 'num_input_tokens_seen': 27374531, 'train_runtime': '1.385e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4765', 'grad_norm': '1.176', 'learning_rate': '4.971e-05', 'epoch': '0.3367', 'num_input_tokens_seen': 27376578, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9033', 'grad_norm': '1.78', 'learning_rate': '4.971e-05', 'epoch': '0.3368', 'num_input_tokens_seen': 27378625, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2655', 'grad_norm': '0.7804', 'learning_rate': '4.971e-05', 'epoch': '0.3368', 'num_input_tokens_seen': 27380672, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7363', 'grad_norm': '1.092', 'learning_rate': '4.971e-05', 'epoch': '0.3368', 'num_input_tokens_seen': 27382719, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.426', 'grad_norm': '2.084', 'learning_rate': '4.971e-05', 'epoch': '0.3368', 'num_input_tokens_seen': 27384766, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.893', 'grad_norm': '1.892', 'learning_rate': '4.971e-05', 'epoch': '0.3369', 'num_input_tokens_seen': 27386813, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9368', 'grad_norm': '1.331', 'learning_rate': '4.971e-05', 'epoch': '0.3369', 'num_input_tokens_seen': 27388860, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2762', 'grad_norm': '0.8071', 'learning_rate': '4.971e-05', 'epoch': '0.3369', 'num_input_tokens_seen': 27390907, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.075', 'grad_norm': '1.905', 'learning_rate': '4.971e-05', 'epoch': '0.3369', 'num_input_tokens_seen': 27392954, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.393', 'grad_norm': '0.889', 'learning_rate': '4.971e-05', 'epoch': '0.337', 'num_input_tokens_seen': 27395001, 'train_runtime': '1.386e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '1.685', 'learning_rate': '4.971e-05', 'epoch': '0.337', 'num_input_tokens_seen': 27397048, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6863', 'grad_norm': '1.184', 'learning_rate': '4.971e-05', 'epoch': '0.337', 'num_input_tokens_seen': 27399095, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.518', 'grad_norm': '2.553', 'learning_rate': '4.971e-05', 'epoch': '0.337', 'num_input_tokens_seen': 27401142, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7602', 'grad_norm': '1.305', 'learning_rate': '4.971e-05', 'epoch': '0.3371', 'num_input_tokens_seen': 27403189, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.013', 'grad_norm': '1.479', 'learning_rate': '4.971e-05', 'epoch': '0.3371', 'num_input_tokens_seen': 27405236, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3141', 'grad_norm': '0.9247', 'learning_rate': '4.971e-05', 'epoch': '0.3371', 'num_input_tokens_seen': 27407283, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7237', 'grad_norm': '1.285', 'learning_rate': '4.971e-05', 'epoch': '0.3371', 'num_input_tokens_seen': 27409330, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5274', 'grad_norm': '1.272', 'learning_rate': '4.971e-05', 'epoch': '0.3372', 'num_input_tokens_seen': 27411377, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4778', 'grad_norm': '1.714', 'learning_rate': '4.971e-05', 'epoch': '0.3372', 'num_input_tokens_seen': 27413424, 'train_runtime': '1.387e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.305', 'grad_norm': '0.7848', 'learning_rate': '4.971e-05', 'epoch': '0.3372', 'num_input_tokens_seen': 27415471, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3324', 'grad_norm': '1.109', 'learning_rate': '4.971e-05', 'epoch': '0.3372', 'num_input_tokens_seen': 27417518, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3814', 'grad_norm': '0.8552', 'learning_rate': '4.971e-05', 'epoch': '0.3373', 'num_input_tokens_seen': 27419565, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3049', 'grad_norm': '1.114', 'learning_rate': '4.971e-05', 'epoch': '0.3373', 'num_input_tokens_seen': 27421612, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6359', 'grad_norm': '1.296', 'learning_rate': '4.971e-05', 'epoch': '0.3373', 'num_input_tokens_seen': 27423659, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.104', 'grad_norm': '1.454', 'learning_rate': '4.971e-05', 'epoch': '0.3373', 'num_input_tokens_seen': 27425706, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4249', 'grad_norm': '0.934', 'learning_rate': '4.971e-05', 'epoch': '0.3374', 'num_input_tokens_seen': 27427753, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7319', 'grad_norm': '1.283', 'learning_rate': '4.971e-05', 'epoch': '0.3374', 'num_input_tokens_seen': 27429800, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5558', 'grad_norm': '1.146', 'learning_rate': '4.971e-05', 'epoch': '0.3374', 'num_input_tokens_seen': 27431847, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7126', 'grad_norm': '1.395', 'learning_rate': '4.971e-05', 'epoch': '0.3374', 'num_input_tokens_seen': 27433894, 'train_runtime': '1.388e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3019', 'grad_norm': '0.8273', 'learning_rate': '4.971e-05', 'epoch': '0.3375', 'num_input_tokens_seen': 27435941, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.141', 'grad_norm': '2.092', 'learning_rate': '4.971e-05', 'epoch': '0.3375', 'num_input_tokens_seen': 27437988, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9726', 'grad_norm': '1.7', 'learning_rate': '4.971e-05', 'epoch': '0.3375', 'num_input_tokens_seen': 27440035, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3289', 'grad_norm': '1.281', 'learning_rate': '4.971e-05', 'epoch': '0.3375', 'num_input_tokens_seen': 27442082, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4677', 'grad_norm': '1.001', 'learning_rate': '4.971e-05', 'epoch': '0.3376', 'num_input_tokens_seen': 27444129, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7996', 'grad_norm': '1.442', 'learning_rate': '4.971e-05', 'epoch': '0.3376', 'num_input_tokens_seen': 27446176, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '1.436', 'learning_rate': '4.971e-05', 'epoch': '0.3376', 'num_input_tokens_seen': 27448223, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.91', 'grad_norm': '2.786', 'learning_rate': '4.971e-05', 'epoch': '0.3376', 'num_input_tokens_seen': 27450270, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.686', 'grad_norm': '2.99', 'learning_rate': '4.971e-05', 'epoch': '0.3377', 'num_input_tokens_seen': 27452317, 'train_runtime': '1.389e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5512', 'grad_norm': '1.173', 'learning_rate': '4.971e-05', 'epoch': '0.3377', 'num_input_tokens_seen': 27454364, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.075', 'grad_norm': '1.807', 'learning_rate': '4.971e-05', 'epoch': '0.3377', 'num_input_tokens_seen': 27456411, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6879', 'grad_norm': '1.256', 'learning_rate': '4.971e-05', 'epoch': '0.3377', 'num_input_tokens_seen': 27458458, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3528', 'grad_norm': '1.128', 'learning_rate': '4.971e-05', 'epoch': '0.3378', 'num_input_tokens_seen': 27460505, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5085', 'grad_norm': '1.015', 'learning_rate': '4.971e-05', 'epoch': '0.3378', 'num_input_tokens_seen': 27462552, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8778', 'grad_norm': '1.432', 'learning_rate': '4.971e-05', 'epoch': '0.3378', 'num_input_tokens_seen': 27464599, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.616', 'grad_norm': '1.229', 'learning_rate': '4.971e-05', 'epoch': '0.3378', 'num_input_tokens_seen': 27466646, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5122', 'grad_norm': '0.9741', 'learning_rate': '4.971e-05', 'epoch': '0.3379', 'num_input_tokens_seen': 27468693, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.278', 'grad_norm': '0.7859', 'learning_rate': '4.971e-05', 'epoch': '0.3379', 'num_input_tokens_seen': 27470740, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5829', 'grad_norm': '1.276', 'learning_rate': '4.971e-05', 'epoch': '0.3379', 'num_input_tokens_seen': 27472787, 'train_runtime': '1.39e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2389', 'grad_norm': '0.7883', 'learning_rate': '4.971e-05', 'epoch': '0.3379', 'num_input_tokens_seen': 27474834, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.673', 'grad_norm': '1.177', 'learning_rate': '4.971e-05', 'epoch': '0.338', 'num_input_tokens_seen': 27476881, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2447', 'grad_norm': '0.9097', 'learning_rate': '4.971e-05', 'epoch': '0.338', 'num_input_tokens_seen': 27478928, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4134', 'grad_norm': '0.877', 'learning_rate': '4.971e-05', 'epoch': '0.338', 'num_input_tokens_seen': 27480975, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2065', 'grad_norm': '0.7775', 'learning_rate': '4.971e-05', 'epoch': '0.338', 'num_input_tokens_seen': 27483022, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4002', 'grad_norm': '0.9586', 'learning_rate': '4.971e-05', 'epoch': '0.3381', 'num_input_tokens_seen': 27485069, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7052', 'grad_norm': '1.178', 'learning_rate': '4.971e-05', 'epoch': '0.3381', 'num_input_tokens_seen': 27487116, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4132', 'grad_norm': '1.133', 'learning_rate': '4.971e-05', 'epoch': '0.3381', 'num_input_tokens_seen': 27489163, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6179', 'grad_norm': '0.976', 'learning_rate': '4.971e-05', 'epoch': '0.3381', 'num_input_tokens_seen': 27491210, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7477', 'grad_norm': '1.356', 'learning_rate': '4.971e-05', 'epoch': '0.3382', 'num_input_tokens_seen': 27493257, 'train_runtime': '1.391e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5711', 'grad_norm': '1.229', 'learning_rate': '4.971e-05', 'epoch': '0.3382', 'num_input_tokens_seen': 27495304, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.531', 'grad_norm': '2.716', 'learning_rate': '4.971e-05', 'epoch': '0.3382', 'num_input_tokens_seen': 27497351, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4193', 'grad_norm': '0.8613', 'learning_rate': '4.971e-05', 'epoch': '0.3382', 'num_input_tokens_seen': 27499398, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3485', 'grad_norm': '0.9252', 'learning_rate': '4.971e-05', 'epoch': '0.3383', 'num_input_tokens_seen': 27501445, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.285', 'grad_norm': '1.043', 'learning_rate': '4.971e-05', 'epoch': '0.3383', 'num_input_tokens_seen': 27503492, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3363', 'grad_norm': '0.7599', 'learning_rate': '4.971e-05', 'epoch': '0.3383', 'num_input_tokens_seen': 27505539, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5403', 'grad_norm': '1.328', 'learning_rate': '4.971e-05', 'epoch': '0.3383', 'num_input_tokens_seen': 27507586, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.469', 'grad_norm': '1.439', 'learning_rate': '4.971e-05', 'epoch': '0.3384', 'num_input_tokens_seen': 27509633, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6076', 'grad_norm': '1.43', 'learning_rate': '4.971e-05', 'epoch': '0.3384', 'num_input_tokens_seen': 27511680, 'train_runtime': '1.392e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7941', 'grad_norm': '1.219', 'learning_rate': '4.971e-05', 'epoch': '0.3384', 'num_input_tokens_seen': 27513727, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5734', 'grad_norm': '1.589', 'learning_rate': '4.971e-05', 'epoch': '0.3384', 'num_input_tokens_seen': 27515774, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.28', 'grad_norm': '1.629', 'learning_rate': '4.971e-05', 'epoch': '0.3385', 'num_input_tokens_seen': 27517821, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2824', 'grad_norm': '0.9115', 'learning_rate': '4.971e-05', 'epoch': '0.3385', 'num_input_tokens_seen': 27519868, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4112', 'grad_norm': '0.9919', 'learning_rate': '4.971e-05', 'epoch': '0.3385', 'num_input_tokens_seen': 27521915, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2325', 'grad_norm': '0.742', 'learning_rate': '4.971e-05', 'epoch': '0.3385', 'num_input_tokens_seen': 27523962, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.437', 'grad_norm': '1.226', 'learning_rate': '4.971e-05', 'epoch': '0.3386', 'num_input_tokens_seen': 27526009, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.028', 'grad_norm': '2.857', 'learning_rate': '4.971e-05', 'epoch': '0.3386', 'num_input_tokens_seen': 27528056, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5001', 'grad_norm': '1.08', 'learning_rate': '4.971e-05', 'epoch': '0.3386', 'num_input_tokens_seen': 27530103, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8397', 'grad_norm': '1.187', 'learning_rate': '4.971e-05', 'epoch': '0.3386', 'num_input_tokens_seen': 27532150, 'train_runtime': '1.393e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4488', 'grad_norm': '1.043', 'learning_rate': '4.971e-05', 'epoch': '0.3387', 'num_input_tokens_seen': 27534197, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2854', 'grad_norm': '0.8202', 'learning_rate': '4.971e-05', 'epoch': '0.3387', 'num_input_tokens_seen': 27536244, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2381', 'grad_norm': '0.7663', 'learning_rate': '4.971e-05', 'epoch': '0.3387', 'num_input_tokens_seen': 27538291, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2419', 'grad_norm': '0.7801', 'learning_rate': '4.971e-05', 'epoch': '0.3387', 'num_input_tokens_seen': 27540338, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.681', 'grad_norm': '1.434', 'learning_rate': '4.971e-05', 'epoch': '0.3388', 'num_input_tokens_seen': 27542385, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5741', 'grad_norm': '1.344', 'learning_rate': '4.971e-05', 'epoch': '0.3388', 'num_input_tokens_seen': 27544432, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3504', 'grad_norm': '1.059', 'learning_rate': '4.971e-05', 'epoch': '0.3388', 'num_input_tokens_seen': 27546479, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6041', 'grad_norm': '1.097', 'learning_rate': '4.971e-05', 'epoch': '0.3388', 'num_input_tokens_seen': 27548526, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4606', 'grad_norm': '1.222', 'learning_rate': '4.971e-05', 'epoch': '0.3389', 'num_input_tokens_seen': 27550573, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.291', 'grad_norm': '2.079', 'learning_rate': '4.971e-05', 'epoch': '0.3389', 'num_input_tokens_seen': 27552620, 'train_runtime': '1.394e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8226', 'grad_norm': '1.352', 'learning_rate': '4.971e-05', 'epoch': '0.3389', 'num_input_tokens_seen': 27554667, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3023', 'grad_norm': '0.7737', 'learning_rate': '4.971e-05', 'epoch': '0.3389', 'num_input_tokens_seen': 27556714, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6935', 'grad_norm': '1.292', 'learning_rate': '4.971e-05', 'epoch': '0.339', 'num_input_tokens_seen': 27558761, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.044', 'grad_norm': '2.051', 'learning_rate': '4.971e-05', 'epoch': '0.339', 'num_input_tokens_seen': 27560808, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5399', 'grad_norm': '0.8853', 'learning_rate': '4.971e-05', 'epoch': '0.339', 'num_input_tokens_seen': 27562855, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5048', 'grad_norm': '1.139', 'learning_rate': '4.971e-05', 'epoch': '0.339', 'num_input_tokens_seen': 27564902, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2694', 'grad_norm': '0.8325', 'learning_rate': '4.971e-05', 'epoch': '0.3391', 'num_input_tokens_seen': 27566949, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7608', 'grad_norm': '1.273', 'learning_rate': '4.971e-05', 'epoch': '0.3391', 'num_input_tokens_seen': 27568996, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2739', 'grad_norm': '0.881', 'learning_rate': '4.971e-05', 'epoch': '0.3391', 'num_input_tokens_seen': 27571043, 'train_runtime': '1.395e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5166', 'grad_norm': '1.149', 'learning_rate': '4.971e-05', 'epoch': '0.3391', 'num_input_tokens_seen': 27573090, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.258', 'grad_norm': '2.187', 'learning_rate': '4.971e-05', 'epoch': '0.3392', 'num_input_tokens_seen': 27575137, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4174', 'grad_norm': '1.061', 'learning_rate': '4.971e-05', 'epoch': '0.3392', 'num_input_tokens_seen': 27577184, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.038', 'grad_norm': '2.405', 'learning_rate': '4.971e-05', 'epoch': '0.3392', 'num_input_tokens_seen': 27579231, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3055', 'grad_norm': '0.7379', 'learning_rate': '4.971e-05', 'epoch': '0.3393', 'num_input_tokens_seen': 27581278, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4391', 'grad_norm': '0.934', 'learning_rate': '4.971e-05', 'epoch': '0.3393', 'num_input_tokens_seen': 27583325, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8994', 'grad_norm': '1.515', 'learning_rate': '4.971e-05', 'epoch': '0.3393', 'num_input_tokens_seen': 27585372, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.094', 'grad_norm': '1.748', 'learning_rate': '4.971e-05', 'epoch': '0.3393', 'num_input_tokens_seen': 27587419, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7724', 'grad_norm': '1.442', 'learning_rate': '4.971e-05', 'epoch': '0.3394', 'num_input_tokens_seen': 27589466, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7051', 'grad_norm': '1.225', 'learning_rate': '4.971e-05', 'epoch': '0.3394', 'num_input_tokens_seen': 27591513, 'train_runtime': '1.396e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.105', 'grad_norm': '1.712', 'learning_rate': '4.971e-05', 'epoch': '0.3394', 'num_input_tokens_seen': 27593560, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1982', 'grad_norm': '0.79', 'learning_rate': '4.971e-05', 'epoch': '0.3394', 'num_input_tokens_seen': 27595607, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3385', 'grad_norm': '0.8314', 'learning_rate': '4.971e-05', 'epoch': '0.3395', 'num_input_tokens_seen': 27597654, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.784', 'grad_norm': '1.163', 'learning_rate': '4.971e-05', 'epoch': '0.3395', 'num_input_tokens_seen': 27599701, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4101', 'grad_norm': '1.025', 'learning_rate': '4.971e-05', 'epoch': '0.3395', 'num_input_tokens_seen': 27601748, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4854', 'grad_norm': '1.218', 'learning_rate': '4.971e-05', 'epoch': '0.3395', 'num_input_tokens_seen': 27603795, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4773', 'grad_norm': '1.359', 'learning_rate': '4.971e-05', 'epoch': '0.3396', 'num_input_tokens_seen': 27605842, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.289', 'grad_norm': '1.859', 'learning_rate': '4.971e-05', 'epoch': '0.3396', 'num_input_tokens_seen': 27607889, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4949', 'grad_norm': '1.257', 'learning_rate': '4.971e-05', 'epoch': '0.3396', 'num_input_tokens_seen': 27609936, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4009', 'grad_norm': '1.134', 'learning_rate': '4.971e-05', 'epoch': '0.3396', 'num_input_tokens_seen': 27611983, 'train_runtime': '1.397e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2115', 'grad_norm': '0.7238', 'learning_rate': '4.971e-05', 'epoch': '0.3397', 'num_input_tokens_seen': 27614030, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2915', 'grad_norm': '0.9631', 'learning_rate': '4.971e-05', 'epoch': '0.3397', 'num_input_tokens_seen': 27616077, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7988', 'grad_norm': '1.224', 'learning_rate': '4.971e-05', 'epoch': '0.3397', 'num_input_tokens_seen': 27618124, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7944', 'grad_norm': '1.29', 'learning_rate': '4.971e-05', 'epoch': '0.3397', 'num_input_tokens_seen': 27620171, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.724', 'grad_norm': '1.384', 'learning_rate': '4.971e-05', 'epoch': '0.3398', 'num_input_tokens_seen': 27622218, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.045', 'grad_norm': '1.999', 'learning_rate': '4.971e-05', 'epoch': '0.3398', 'num_input_tokens_seen': 27624265, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8355', 'grad_norm': '2.079', 'learning_rate': '4.971e-05', 'epoch': '0.3398', 'num_input_tokens_seen': 27626312, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9285', 'grad_norm': '1.677', 'learning_rate': '4.971e-05', 'epoch': '0.3398', 'num_input_tokens_seen': 27628359, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7234', 'grad_norm': '1.206', 'learning_rate': '4.971e-05', 'epoch': '0.3399', 'num_input_tokens_seen': 27630406, 'train_runtime': '1.398e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.275', 'grad_norm': '2.663', 'learning_rate': '4.97e-05', 'epoch': '0.3399', 'num_input_tokens_seen': 27632453, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8178', 'grad_norm': '1.263', 'learning_rate': '4.97e-05', 'epoch': '0.3399', 'num_input_tokens_seen': 27634500, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.084', 'grad_norm': '1.671', 'learning_rate': '4.97e-05', 'epoch': '0.3399', 'num_input_tokens_seen': 27636547, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.165', 'grad_norm': '2.105', 'learning_rate': '4.97e-05', 'epoch': '0.34', 'num_input_tokens_seen': 27638594, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7561', 'grad_norm': '1.17', 'learning_rate': '4.97e-05', 'epoch': '0.34', 'num_input_tokens_seen': 27640641, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3719', 'grad_norm': '0.945', 'learning_rate': '4.97e-05', 'epoch': '0.34', 'num_input_tokens_seen': 27642688, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4921', 'grad_norm': '1.032', 'learning_rate': '4.97e-05', 'epoch': '0.34', 'num_input_tokens_seen': 27644735, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2573', 'grad_norm': '0.7741', 'learning_rate': '4.97e-05', 'epoch': '0.3401', 'num_input_tokens_seen': 27646782, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.288', 'grad_norm': '2.102', 'learning_rate': '4.97e-05', 'epoch': '0.3401', 'num_input_tokens_seen': 27648829, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9368', 'grad_norm': '1.65', 'learning_rate': '4.97e-05', 'epoch': '0.3401', 'num_input_tokens_seen': 27650876, 'train_runtime': '1.399e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8639', 'grad_norm': '1.995', 'learning_rate': '4.97e-05', 'epoch': '0.3401', 'num_input_tokens_seen': 27652923, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.36', 'grad_norm': '2.45', 'learning_rate': '4.97e-05', 'epoch': '0.3402', 'num_input_tokens_seen': 27654970, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8518', 'grad_norm': '1.407', 'learning_rate': '4.97e-05', 'epoch': '0.3402', 'num_input_tokens_seen': 27657017, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8631', 'grad_norm': '1.431', 'learning_rate': '4.97e-05', 'epoch': '0.3402', 'num_input_tokens_seen': 27659064, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9982', 'grad_norm': '1.324', 'learning_rate': '4.97e-05', 'epoch': '0.3402', 'num_input_tokens_seen': 27661111, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.484', 'grad_norm': '1.181', 'learning_rate': '4.97e-05', 'epoch': '0.3403', 'num_input_tokens_seen': 27663158, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.536', 'grad_norm': '1.202', 'learning_rate': '4.97e-05', 'epoch': '0.3403', 'num_input_tokens_seen': 27665205, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.135', 'grad_norm': '2.008', 'learning_rate': '4.97e-05', 'epoch': '0.3403', 'num_input_tokens_seen': 27667252, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.82', 'grad_norm': '2.805', 'learning_rate': '4.97e-05', 'epoch': '0.3403', 'num_input_tokens_seen': 27669299, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5547', 'grad_norm': '1.181', 'learning_rate': '4.97e-05', 'epoch': '0.3404', 'num_input_tokens_seen': 27671346, 'train_runtime': '1.4e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5076', 'grad_norm': '1.117', 'learning_rate': '4.97e-05', 'epoch': '0.3404', 'num_input_tokens_seen': 27673393, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3575', 'grad_norm': '1.02', 'learning_rate': '4.97e-05', 'epoch': '0.3404', 'num_input_tokens_seen': 27675440, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9243', 'grad_norm': '1.395', 'learning_rate': '4.97e-05', 'epoch': '0.3404', 'num_input_tokens_seen': 27677487, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.296', 'grad_norm': '1.446', 'learning_rate': '4.97e-05', 'epoch': '0.3405', 'num_input_tokens_seen': 27679534, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7671', 'grad_norm': '2.127', 'learning_rate': '4.97e-05', 'epoch': '0.3405', 'num_input_tokens_seen': 27681581, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4024', 'grad_norm': '1.405', 'learning_rate': '4.97e-05', 'epoch': '0.3405', 'num_input_tokens_seen': 27683628, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3235', 'grad_norm': '0.7945', 'learning_rate': '4.97e-05', 'epoch': '0.3405', 'num_input_tokens_seen': 27685675, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3185', 'grad_norm': '0.796', 'learning_rate': '4.97e-05', 'epoch': '0.3406', 'num_input_tokens_seen': 27687722, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.332', 'grad_norm': '2.136', 'learning_rate': '4.97e-05', 'epoch': '0.3406', 'num_input_tokens_seen': 27689769, 'train_runtime': '1.401e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.303', 'grad_norm': '2.192', 'learning_rate': '4.97e-05', 'epoch': '0.3406', 'num_input_tokens_seen': 27691816, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8819', 'grad_norm': '1.744', 'learning_rate': '4.97e-05', 'epoch': '0.3406', 'num_input_tokens_seen': 27693863, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.257', 'grad_norm': '1.957', 'learning_rate': '4.97e-05', 'epoch': '0.3407', 'num_input_tokens_seen': 27695910, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2483', 'grad_norm': '0.7777', 'learning_rate': '4.97e-05', 'epoch': '0.3407', 'num_input_tokens_seen': 27697957, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2936', 'grad_norm': '0.9456', 'learning_rate': '4.97e-05', 'epoch': '0.3407', 'num_input_tokens_seen': 27700004, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.791', 'grad_norm': '1.776', 'learning_rate': '4.97e-05', 'epoch': '0.3407', 'num_input_tokens_seen': 27702051, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.054', 'grad_norm': '2.468', 'learning_rate': '4.97e-05', 'epoch': '0.3408', 'num_input_tokens_seen': 27704098, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.439', 'grad_norm': '1.136', 'learning_rate': '4.97e-05', 'epoch': '0.3408', 'num_input_tokens_seen': 27706145, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.304', 'grad_norm': '0.959', 'learning_rate': '4.97e-05', 'epoch': '0.3408', 'num_input_tokens_seen': 27708192, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5816', 'grad_norm': '1.225', 'learning_rate': '4.97e-05', 'epoch': '0.3408', 'num_input_tokens_seen': 27710239, 'train_runtime': '1.402e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3449', 'grad_norm': '0.9061', 'learning_rate': '4.97e-05', 'epoch': '0.3409', 'num_input_tokens_seen': 27712286, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5334', 'grad_norm': '1.261', 'learning_rate': '4.97e-05', 'epoch': '0.3409', 'num_input_tokens_seen': 27714333, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.005', 'grad_norm': '1.615', 'learning_rate': '4.97e-05', 'epoch': '0.3409', 'num_input_tokens_seen': 27716380, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.288', 'grad_norm': '2.117', 'learning_rate': '4.97e-05', 'epoch': '0.3409', 'num_input_tokens_seen': 27718427, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4253', 'grad_norm': '0.9223', 'learning_rate': '4.97e-05', 'epoch': '0.341', 'num_input_tokens_seen': 27720474, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.035', 'grad_norm': '2.007', 'learning_rate': '4.97e-05', 'epoch': '0.341', 'num_input_tokens_seen': 27722521, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6836', 'grad_norm': '1.155', 'learning_rate': '4.97e-05', 'epoch': '0.341', 'num_input_tokens_seen': 27724568, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4321', 'grad_norm': '1.069', 'learning_rate': '4.97e-05', 'epoch': '0.341', 'num_input_tokens_seen': 27726615, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2451', 'grad_norm': '0.8625', 'learning_rate': '4.97e-05', 'epoch': '0.3411', 'num_input_tokens_seen': 27728662, 'train_runtime': '1.403e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3869', 'grad_norm': '1.153', 'learning_rate': '4.97e-05', 'epoch': '0.3411', 'num_input_tokens_seen': 27730709, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.864', 'grad_norm': '1.258', 'learning_rate': '4.97e-05', 'epoch': '0.3411', 'num_input_tokens_seen': 27732756, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4863', 'grad_norm': '1.435', 'learning_rate': '4.97e-05', 'epoch': '0.3411', 'num_input_tokens_seen': 27734803, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '1.841', 'learning_rate': '4.97e-05', 'epoch': '0.3412', 'num_input_tokens_seen': 27736850, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.272', 'grad_norm': '2.55', 'learning_rate': '4.97e-05', 'epoch': '0.3412', 'num_input_tokens_seen': 27738897, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3397', 'grad_norm': '0.8855', 'learning_rate': '4.97e-05', 'epoch': '0.3412', 'num_input_tokens_seen': 27740944, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7943', 'grad_norm': '1.096', 'learning_rate': '4.97e-05', 'epoch': '0.3412', 'num_input_tokens_seen': 27742991, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8354', 'grad_norm': '1.355', 'learning_rate': '4.97e-05', 'epoch': '0.3413', 'num_input_tokens_seen': 27745038, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4751', 'grad_norm': '1.11', 'learning_rate': '4.97e-05', 'epoch': '0.3413', 'num_input_tokens_seen': 27747085, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8388', 'grad_norm': '1.162', 'learning_rate': '4.97e-05', 'epoch': '0.3413', 'num_input_tokens_seen': 27749132, 'train_runtime': '1.404e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.387', 'grad_norm': '0.9166', 'learning_rate': '4.97e-05', 'epoch': '0.3413', 'num_input_tokens_seen': 27751179, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.706', 'grad_norm': '1.187', 'learning_rate': '4.97e-05', 'epoch': '0.3414', 'num_input_tokens_seen': 27753226, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3789', 'grad_norm': '0.9927', 'learning_rate': '4.97e-05', 'epoch': '0.3414', 'num_input_tokens_seen': 27755273, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.122', 'grad_norm': '1.878', 'learning_rate': '4.97e-05', 'epoch': '0.3414', 'num_input_tokens_seen': 27757320, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3991', 'grad_norm': '0.9487', 'learning_rate': '4.97e-05', 'epoch': '0.3414', 'num_input_tokens_seen': 27759367, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5529', 'grad_norm': '1.606', 'learning_rate': '4.97e-05', 'epoch': '0.3415', 'num_input_tokens_seen': 27761414, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.624', 'grad_norm': '2.49', 'learning_rate': '4.97e-05', 'epoch': '0.3415', 'num_input_tokens_seen': 27763461, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3984', 'grad_norm': '1.165', 'learning_rate': '4.97e-05', 'epoch': '0.3415', 'num_input_tokens_seen': 27765508, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.132', 'grad_norm': '1.641', 'learning_rate': '4.97e-05', 'epoch': '0.3415', 'num_input_tokens_seen': 27767555, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4488', 'grad_norm': '1.224', 'learning_rate': '4.97e-05', 'epoch': '0.3416', 'num_input_tokens_seen': 27769602, 'train_runtime': '1.405e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4656', 'grad_norm': '1.035', 'learning_rate': '4.97e-05', 'epoch': '0.3416', 'num_input_tokens_seen': 27771649, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.327', 'grad_norm': '2.781', 'learning_rate': '4.97e-05', 'epoch': '0.3416', 'num_input_tokens_seen': 27773696, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.381', 'grad_norm': '0.9645', 'learning_rate': '4.97e-05', 'epoch': '0.3416', 'num_input_tokens_seen': 27775743, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.024', 'grad_norm': '2.023', 'learning_rate': '4.97e-05', 'epoch': '0.3417', 'num_input_tokens_seen': 27777790, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2123', 'grad_norm': '0.6896', 'learning_rate': '4.97e-05', 'epoch': '0.3417', 'num_input_tokens_seen': 27779837, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7462', 'grad_norm': '1.856', 'learning_rate': '4.97e-05', 'epoch': '0.3417', 'num_input_tokens_seen': 27781884, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1737', 'grad_norm': '0.8299', 'learning_rate': '4.97e-05', 'epoch': '0.3417', 'num_input_tokens_seen': 27783931, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6298', 'grad_norm': '1.484', 'learning_rate': '4.97e-05', 'epoch': '0.3418', 'num_input_tokens_seen': 27785978, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.586', 'grad_norm': '1.692', 'learning_rate': '4.97e-05', 'epoch': '0.3418', 'num_input_tokens_seen': 27788025, 'train_runtime': '1.406e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8094', 'grad_norm': '1.934', 'learning_rate': '4.97e-05', 'epoch': '0.3418', 'num_input_tokens_seen': 27790072, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4757', 'grad_norm': '2.577', 'learning_rate': '4.97e-05', 'epoch': '0.3418', 'num_input_tokens_seen': 27792119, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3739', 'grad_norm': '0.7904', 'learning_rate': '4.97e-05', 'epoch': '0.3419', 'num_input_tokens_seen': 27794166, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3544', 'grad_norm': '0.9545', 'learning_rate': '4.97e-05', 'epoch': '0.3419', 'num_input_tokens_seen': 27796213, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7271', 'grad_norm': '1.758', 'learning_rate': '4.97e-05', 'epoch': '0.3419', 'num_input_tokens_seen': 27798260, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7979', 'grad_norm': '1.327', 'learning_rate': '4.97e-05', 'epoch': '0.3419', 'num_input_tokens_seen': 27800307, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.126', 'grad_norm': '2.12', 'learning_rate': '4.97e-05', 'epoch': '0.342', 'num_input_tokens_seen': 27802354, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6519', 'grad_norm': '1.274', 'learning_rate': '4.97e-05', 'epoch': '0.342', 'num_input_tokens_seen': 27804401, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3886', 'grad_norm': '0.8828', 'learning_rate': '4.97e-05', 'epoch': '0.342', 'num_input_tokens_seen': 27806448, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3305', 'grad_norm': '0.9492', 'learning_rate': '4.97e-05', 'epoch': '0.342', 'num_input_tokens_seen': 27808495, 'train_runtime': '1.407e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4589', 'grad_norm': '1.031', 'learning_rate': '4.97e-05', 'epoch': '0.3421', 'num_input_tokens_seen': 27810542, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7044', 'grad_norm': '1.212', 'learning_rate': '4.97e-05', 'epoch': '0.3421', 'num_input_tokens_seen': 27812589, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.2', 'grad_norm': '1.866', 'learning_rate': '4.97e-05', 'epoch': '0.3421', 'num_input_tokens_seen': 27814636, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3598', 'grad_norm': '1.082', 'learning_rate': '4.97e-05', 'epoch': '0.3421', 'num_input_tokens_seen': 27816683, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3554', 'grad_norm': '1.073', 'learning_rate': '4.97e-05', 'epoch': '0.3422', 'num_input_tokens_seen': 27818730, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3486', 'grad_norm': '1.061', 'learning_rate': '4.97e-05', 'epoch': '0.3422', 'num_input_tokens_seen': 27820777, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.441', 'grad_norm': '2.536', 'learning_rate': '4.97e-05', 'epoch': '0.3422', 'num_input_tokens_seen': 27822824, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6078', 'grad_norm': '1.275', 'learning_rate': '4.97e-05', 'epoch': '0.3422', 'num_input_tokens_seen': 27824871, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1861', 'grad_norm': '0.822', 'learning_rate': '4.97e-05', 'epoch': '0.3423', 'num_input_tokens_seen': 27826918, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.145', 'grad_norm': '2.009', 'learning_rate': '4.97e-05', 'epoch': '0.3423', 'num_input_tokens_seen': 27828965, 'train_runtime': '1.408e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6614', 'grad_norm': '1.567', 'learning_rate': '4.97e-05', 'epoch': '0.3423', 'num_input_tokens_seen': 27831012, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8095', 'grad_norm': '1.453', 'learning_rate': '4.97e-05', 'epoch': '0.3423', 'num_input_tokens_seen': 27833059, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6471', 'grad_norm': '1.393', 'learning_rate': '4.97e-05', 'epoch': '0.3424', 'num_input_tokens_seen': 27835106, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4075', 'grad_norm': '1.158', 'learning_rate': '4.97e-05', 'epoch': '0.3424', 'num_input_tokens_seen': 27837153, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.91', 'grad_norm': '2.638', 'learning_rate': '4.97e-05', 'epoch': '0.3424', 'num_input_tokens_seen': 27839200, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.537', 'grad_norm': '0.8956', 'learning_rate': '4.97e-05', 'epoch': '0.3424', 'num_input_tokens_seen': 27841247, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3749', 'grad_norm': '1.041', 'learning_rate': '4.97e-05', 'epoch': '0.3425', 'num_input_tokens_seen': 27843294, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6055', 'grad_norm': '1.22', 'learning_rate': '4.97e-05', 'epoch': '0.3425', 'num_input_tokens_seen': 27845341, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6174', 'grad_norm': '1.466', 'learning_rate': '4.97e-05', 'epoch': '0.3425', 'num_input_tokens_seen': 27847388, 'train_runtime': '1.409e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8494', 'grad_norm': '2.107', 'learning_rate': '4.97e-05', 'epoch': '0.3425', 'num_input_tokens_seen': 27849435, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8992', 'grad_norm': '1.689', 'learning_rate': '4.97e-05', 'epoch': '0.3426', 'num_input_tokens_seen': 27851482, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6612', 'grad_norm': '1.234', 'learning_rate': '4.97e-05', 'epoch': '0.3426', 'num_input_tokens_seen': 27853529, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8312', 'grad_norm': '1.435', 'learning_rate': '4.97e-05', 'epoch': '0.3426', 'num_input_tokens_seen': 27855576, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6904', 'grad_norm': '1.069', 'learning_rate': '4.97e-05', 'epoch': '0.3426', 'num_input_tokens_seen': 27857623, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4013', 'grad_norm': '1.021', 'learning_rate': '4.97e-05', 'epoch': '0.3427', 'num_input_tokens_seen': 27859670, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8839', 'grad_norm': '1.935', 'learning_rate': '4.97e-05', 'epoch': '0.3427', 'num_input_tokens_seen': 27861717, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.513', 'grad_norm': '1.252', 'learning_rate': '4.97e-05', 'epoch': '0.3427', 'num_input_tokens_seen': 27863764, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8723', 'grad_norm': '1.498', 'learning_rate': '4.97e-05', 'epoch': '0.3427', 'num_input_tokens_seen': 27865811, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4465', 'grad_norm': '1.082', 'learning_rate': '4.97e-05', 'epoch': '0.3428', 'num_input_tokens_seen': 27867858, 'train_runtime': '1.41e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5977', 'grad_norm': '1.493', 'learning_rate': '4.97e-05', 'epoch': '0.3428', 'num_input_tokens_seen': 27869905, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2979', 'grad_norm': '1.007', 'learning_rate': '4.97e-05', 'epoch': '0.3428', 'num_input_tokens_seen': 27871952, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.173', 'grad_norm': '2.135', 'learning_rate': '4.97e-05', 'epoch': '0.3429', 'num_input_tokens_seen': 27873999, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4403', 'grad_norm': '1.359', 'learning_rate': '4.97e-05', 'epoch': '0.3429', 'num_input_tokens_seen': 27876046, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.274', 'grad_norm': '2.285', 'learning_rate': '4.97e-05', 'epoch': '0.3429', 'num_input_tokens_seen': 27878093, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.181', 'grad_norm': '2.137', 'learning_rate': '4.97e-05', 'epoch': '0.3429', 'num_input_tokens_seen': 27880140, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2666', 'grad_norm': '0.7455', 'learning_rate': '4.97e-05', 'epoch': '0.343', 'num_input_tokens_seen': 27882187, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4297', 'grad_norm': '1.145', 'learning_rate': '4.97e-05', 'epoch': '0.343', 'num_input_tokens_seen': 27884234, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3541', 'grad_norm': '0.7782', 'learning_rate': '4.97e-05', 'epoch': '0.343', 'num_input_tokens_seen': 27886281, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.7', 'grad_norm': '2.75', 'learning_rate': '4.97e-05', 'epoch': '0.343', 'num_input_tokens_seen': 27888328, 'train_runtime': '1.411e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7536', 'grad_norm': '0.9564', 'learning_rate': '4.97e-05', 'epoch': '0.3431', 'num_input_tokens_seen': 27890375, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.17', 'grad_norm': '2.365', 'learning_rate': '4.97e-05', 'epoch': '0.3431', 'num_input_tokens_seen': 27892422, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7969', 'grad_norm': '1.501', 'learning_rate': '4.97e-05', 'epoch': '0.3431', 'num_input_tokens_seen': 27894469, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4781', 'grad_norm': '1.21', 'learning_rate': '4.97e-05', 'epoch': '0.3431', 'num_input_tokens_seen': 27896516, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4904', 'grad_norm': '1.062', 'learning_rate': '4.97e-05', 'epoch': '0.3432', 'num_input_tokens_seen': 27898563, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.347', 'grad_norm': '2.343', 'learning_rate': '4.97e-05', 'epoch': '0.3432', 'num_input_tokens_seen': 27900610, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8367', 'grad_norm': '1.549', 'learning_rate': '4.97e-05', 'epoch': '0.3432', 'num_input_tokens_seen': 27902657, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6176', 'grad_norm': '0.8668', 'learning_rate': '4.97e-05', 'epoch': '0.3432', 'num_input_tokens_seen': 27904704, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4159', 'grad_norm': '1.19', 'learning_rate': '4.97e-05', 'epoch': '0.3433', 'num_input_tokens_seen': 27906751, 'train_runtime': '1.412e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3989', 'grad_norm': '1.114', 'learning_rate': '4.97e-05', 'epoch': '0.3433', 'num_input_tokens_seen': 27908798, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4054', 'grad_norm': '1.032', 'learning_rate': '4.97e-05', 'epoch': '0.3433', 'num_input_tokens_seen': 27910845, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.455', 'grad_norm': '2.524', 'learning_rate': '4.97e-05', 'epoch': '0.3433', 'num_input_tokens_seen': 27912892, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2455', 'grad_norm': '0.851', 'learning_rate': '4.97e-05', 'epoch': '0.3434', 'num_input_tokens_seen': 27914939, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3063', 'grad_norm': '0.7627', 'learning_rate': '4.97e-05', 'epoch': '0.3434', 'num_input_tokens_seen': 27916986, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.703', 'grad_norm': '2.71', 'learning_rate': '4.97e-05', 'epoch': '0.3434', 'num_input_tokens_seen': 27919033, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.555', 'grad_norm': '2.867', 'learning_rate': '4.97e-05', 'epoch': '0.3434', 'num_input_tokens_seen': 27921080, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4025', 'grad_norm': '0.9311', 'learning_rate': '4.97e-05', 'epoch': '0.3435', 'num_input_tokens_seen': 27923127, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5738', 'grad_norm': '1.378', 'learning_rate': '4.97e-05', 'epoch': '0.3435', 'num_input_tokens_seen': 27925174, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.212', 'grad_norm': '2.076', 'learning_rate': '4.97e-05', 'epoch': '0.3435', 'num_input_tokens_seen': 27927221, 'train_runtime': '1.413e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5077', 'grad_norm': '1.185', 'learning_rate': '4.97e-05', 'epoch': '0.3435', 'num_input_tokens_seen': 27929268, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7297', 'grad_norm': '1.859', 'learning_rate': '4.97e-05', 'epoch': '0.3436', 'num_input_tokens_seen': 27931315, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5391', 'grad_norm': '1.215', 'learning_rate': '4.97e-05', 'epoch': '0.3436', 'num_input_tokens_seen': 27933362, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.209', 'grad_norm': '0.7549', 'learning_rate': '4.97e-05', 'epoch': '0.3436', 'num_input_tokens_seen': 27935409, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4417', 'grad_norm': '1.071', 'learning_rate': '4.97e-05', 'epoch': '0.3436', 'num_input_tokens_seen': 27937456, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4795', 'grad_norm': '1.124', 'learning_rate': '4.97e-05', 'epoch': '0.3437', 'num_input_tokens_seen': 27939503, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '1.444', 'learning_rate': '4.97e-05', 'epoch': '0.3437', 'num_input_tokens_seen': 27941550, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7084', 'grad_norm': '1.248', 'learning_rate': '4.97e-05', 'epoch': '0.3437', 'num_input_tokens_seen': 27943597, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.714', 'grad_norm': '2.411', 'learning_rate': '4.97e-05', 'epoch': '0.3437', 'num_input_tokens_seen': 27945644, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6069', 'grad_norm': '1.332', 'learning_rate': '4.97e-05', 'epoch': '0.3438', 'num_input_tokens_seen': 27947691, 'train_runtime': '1.414e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2285', 'grad_norm': '0.6736', 'learning_rate': '4.97e-05', 'epoch': '0.3438', 'num_input_tokens_seen': 27949738, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5519', 'grad_norm': '1.08', 'learning_rate': '4.97e-05', 'epoch': '0.3438', 'num_input_tokens_seen': 27951785, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8351', 'grad_norm': '1.33', 'learning_rate': '4.97e-05', 'epoch': '0.3438', 'num_input_tokens_seen': 27953832, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5716', 'grad_norm': '0.8641', 'learning_rate': '4.97e-05', 'epoch': '0.3439', 'num_input_tokens_seen': 27955879, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5268', 'grad_norm': '1.272', 'learning_rate': '4.97e-05', 'epoch': '0.3439', 'num_input_tokens_seen': 27957926, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6465', 'grad_norm': '1.173', 'learning_rate': '4.97e-05', 'epoch': '0.3439', 'num_input_tokens_seen': 27959973, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3665', 'grad_norm': '0.8493', 'learning_rate': '4.969e-05', 'epoch': '0.3439', 'num_input_tokens_seen': 27962020, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.08', 'grad_norm': '1.793', 'learning_rate': '4.969e-05', 'epoch': '0.344', 'num_input_tokens_seen': 27964067, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8708', 'grad_norm': '1.564', 'learning_rate': '4.969e-05', 'epoch': '0.344', 'num_input_tokens_seen': 27966114, 'train_runtime': '1.415e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8706', 'grad_norm': '1.445', 'learning_rate': '4.969e-05', 'epoch': '0.344', 'num_input_tokens_seen': 27968161, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5762', 'grad_norm': '0.9362', 'learning_rate': '4.969e-05', 'epoch': '0.344', 'num_input_tokens_seen': 27970208, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3496', 'grad_norm': '0.8422', 'learning_rate': '4.969e-05', 'epoch': '0.3441', 'num_input_tokens_seen': 27972255, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9394', 'grad_norm': '1.811', 'learning_rate': '4.969e-05', 'epoch': '0.3441', 'num_input_tokens_seen': 27974302, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3207', 'grad_norm': '0.8292', 'learning_rate': '4.969e-05', 'epoch': '0.3441', 'num_input_tokens_seen': 27976349, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.615', 'grad_norm': '1.318', 'learning_rate': '4.969e-05', 'epoch': '0.3441', 'num_input_tokens_seen': 27978396, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6114', 'grad_norm': '1.257', 'learning_rate': '4.969e-05', 'epoch': '0.3442', 'num_input_tokens_seen': 27980443, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4586', 'grad_norm': '0.7022', 'learning_rate': '4.969e-05', 'epoch': '0.3442', 'num_input_tokens_seen': 27982490, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2908', 'grad_norm': '0.8206', 'learning_rate': '4.969e-05', 'epoch': '0.3442', 'num_input_tokens_seen': 27984537, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3919', 'grad_norm': '1.077', 'learning_rate': '4.969e-05', 'epoch': '0.3442', 'num_input_tokens_seen': 27986584, 'train_runtime': '1.416e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3185', 'grad_norm': '0.9088', 'learning_rate': '4.969e-05', 'epoch': '0.3443', 'num_input_tokens_seen': 27988631, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4084', 'grad_norm': '1.512', 'learning_rate': '4.969e-05', 'epoch': '0.3443', 'num_input_tokens_seen': 27990678, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4981', 'grad_norm': '1.413', 'learning_rate': '4.969e-05', 'epoch': '0.3443', 'num_input_tokens_seen': 27992725, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5452', 'grad_norm': '1.126', 'learning_rate': '4.969e-05', 'epoch': '0.3443', 'num_input_tokens_seen': 27994772, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.256', 'grad_norm': '1.001', 'learning_rate': '4.969e-05', 'epoch': '0.3444', 'num_input_tokens_seen': 27996819, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7804', 'grad_norm': '1.435', 'learning_rate': '4.969e-05', 'epoch': '0.3444', 'num_input_tokens_seen': 27998866, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2531', 'grad_norm': '0.8712', 'learning_rate': '4.969e-05', 'epoch': '0.3444', 'num_input_tokens_seen': 28000913, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.371', 'grad_norm': '1.906', 'learning_rate': '4.969e-05', 'epoch': '0.3444', 'num_input_tokens_seen': 28002960, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7532', 'grad_norm': '1.309', 'learning_rate': '4.969e-05', 'epoch': '0.3445', 'num_input_tokens_seen': 28005007, 'train_runtime': '1.417e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3303', 'grad_norm': '0.9263', 'learning_rate': '4.969e-05', 'epoch': '0.3445', 'num_input_tokens_seen': 28007054, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6877', 'grad_norm': '1.053', 'learning_rate': '4.969e-05', 'epoch': '0.3445', 'num_input_tokens_seen': 28009101, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4393', 'grad_norm': '1.356', 'learning_rate': '4.969e-05', 'epoch': '0.3445', 'num_input_tokens_seen': 28011148, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.385', 'grad_norm': '0.8968', 'learning_rate': '4.969e-05', 'epoch': '0.3446', 'num_input_tokens_seen': 28013195, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4776', 'grad_norm': '1.109', 'learning_rate': '4.969e-05', 'epoch': '0.3446', 'num_input_tokens_seen': 28015242, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6383', 'grad_norm': '1.21', 'learning_rate': '4.969e-05', 'epoch': '0.3446', 'num_input_tokens_seen': 28017289, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2716', 'grad_norm': '0.7504', 'learning_rate': '4.969e-05', 'epoch': '0.3446', 'num_input_tokens_seen': 28019336, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5908', 'grad_norm': '1.116', 'learning_rate': '4.969e-05', 'epoch': '0.3447', 'num_input_tokens_seen': 28021383, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.302', 'grad_norm': '3.38', 'learning_rate': '4.969e-05', 'epoch': '0.3447', 'num_input_tokens_seen': 28023430, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.433', 'grad_norm': '2.366', 'learning_rate': '4.969e-05', 'epoch': '0.3447', 'num_input_tokens_seen': 28025477, 'train_runtime': '1.418e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.083', 'grad_norm': '1.754', 'learning_rate': '4.969e-05', 'epoch': '0.3447', 'num_input_tokens_seen': 28027524, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.009', 'grad_norm': '3.381', 'learning_rate': '4.969e-05', 'epoch': '0.3448', 'num_input_tokens_seen': 28029571, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.112', 'grad_norm': '2.072', 'learning_rate': '4.969e-05', 'epoch': '0.3448', 'num_input_tokens_seen': 28031618, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5893', 'grad_norm': '1.326', 'learning_rate': '4.969e-05', 'epoch': '0.3448', 'num_input_tokens_seen': 28033665, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4593', 'grad_norm': '0.8761', 'learning_rate': '4.969e-05', 'epoch': '0.3448', 'num_input_tokens_seen': 28035712, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6938', 'grad_norm': '1.243', 'learning_rate': '4.969e-05', 'epoch': '0.3449', 'num_input_tokens_seen': 28037759, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3378', 'grad_norm': '0.8784', 'learning_rate': '4.969e-05', 'epoch': '0.3449', 'num_input_tokens_seen': 28039806, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4202', 'grad_norm': '1.163', 'learning_rate': '4.969e-05', 'epoch': '0.3449', 'num_input_tokens_seen': 28041853, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9488', 'grad_norm': '1.335', 'learning_rate': '4.969e-05', 'epoch': '0.3449', 'num_input_tokens_seen': 28043900, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '2.014', 'learning_rate': '4.969e-05', 'epoch': '0.345', 'num_input_tokens_seen': 28045947, 'train_runtime': '1.419e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6934', 'grad_norm': '1.588', 'learning_rate': '4.969e-05', 'epoch': '0.345', 'num_input_tokens_seen': 28047994, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4825', 'grad_norm': '1.191', 'learning_rate': '4.969e-05', 'epoch': '0.345', 'num_input_tokens_seen': 28050041, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.33', 'grad_norm': '2.363', 'learning_rate': '4.969e-05', 'epoch': '0.345', 'num_input_tokens_seen': 28052088, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6641', 'grad_norm': '1.178', 'learning_rate': '4.969e-05', 'epoch': '0.3451', 'num_input_tokens_seen': 28054135, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.109', 'grad_norm': '2.021', 'learning_rate': '4.969e-05', 'epoch': '0.3451', 'num_input_tokens_seen': 28056182, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6418', 'grad_norm': '1.661', 'learning_rate': '4.969e-05', 'epoch': '0.3451', 'num_input_tokens_seen': 28058229, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.395', 'grad_norm': '2.158', 'learning_rate': '4.969e-05', 'epoch': '0.3451', 'num_input_tokens_seen': 28060276, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2965', 'grad_norm': '0.7544', 'learning_rate': '4.969e-05', 'epoch': '0.3452', 'num_input_tokens_seen': 28062323, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2853', 'grad_norm': '0.8148', 'learning_rate': '4.969e-05', 'epoch': '0.3452', 'num_input_tokens_seen': 28064370, 'train_runtime': '1.42e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6942', 'grad_norm': '1.398', 'learning_rate': '4.969e-05', 'epoch': '0.3452', 'num_input_tokens_seen': 28066417, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8732', 'grad_norm': '2.131', 'learning_rate': '4.969e-05', 'epoch': '0.3452', 'num_input_tokens_seen': 28068464, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3568', 'grad_norm': '0.919', 'learning_rate': '4.969e-05', 'epoch': '0.3453', 'num_input_tokens_seen': 28070511, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9009', 'grad_norm': '1.424', 'learning_rate': '4.969e-05', 'epoch': '0.3453', 'num_input_tokens_seen': 28072558, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.389', 'grad_norm': '2.362', 'learning_rate': '4.969e-05', 'epoch': '0.3453', 'num_input_tokens_seen': 28074605, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.024', 'grad_norm': '2.25', 'learning_rate': '4.969e-05', 'epoch': '0.3453', 'num_input_tokens_seen': 28076652, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6907', 'grad_norm': '1.813', 'learning_rate': '4.969e-05', 'epoch': '0.3454', 'num_input_tokens_seen': 28078699, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3206', 'grad_norm': '0.9214', 'learning_rate': '4.969e-05', 'epoch': '0.3454', 'num_input_tokens_seen': 28080746, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9774', 'grad_norm': '1.401', 'learning_rate': '4.969e-05', 'epoch': '0.3454', 'num_input_tokens_seen': 28082793, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.984', 'grad_norm': '2.242', 'learning_rate': '4.969e-05', 'epoch': '0.3454', 'num_input_tokens_seen': 28084840, 'train_runtime': '1.421e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.014', 'grad_norm': '1.995', 'learning_rate': '4.969e-05', 'epoch': '0.3455', 'num_input_tokens_seen': 28086887, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.982', 'grad_norm': '1.92', 'learning_rate': '4.969e-05', 'epoch': '0.3455', 'num_input_tokens_seen': 28088934, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7558', 'grad_norm': '1.441', 'learning_rate': '4.969e-05', 'epoch': '0.3455', 'num_input_tokens_seen': 28090981, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6961', 'grad_norm': '1.239', 'learning_rate': '4.969e-05', 'epoch': '0.3455', 'num_input_tokens_seen': 28093028, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4501', 'grad_norm': '1.465', 'learning_rate': '4.969e-05', 'epoch': '0.3456', 'num_input_tokens_seen': 28095075, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '2.091', 'learning_rate': '4.969e-05', 'epoch': '0.3456', 'num_input_tokens_seen': 28097122, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2146', 'grad_norm': '0.7321', 'learning_rate': '4.969e-05', 'epoch': '0.3456', 'num_input_tokens_seen': 28099169, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8409', 'grad_norm': '1.539', 'learning_rate': '4.969e-05', 'epoch': '0.3456', 'num_input_tokens_seen': 28101216, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8287', 'grad_norm': '1.68', 'learning_rate': '4.969e-05', 'epoch': '0.3457', 'num_input_tokens_seen': 28103263, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3369', 'grad_norm': '0.7954', 'learning_rate': '4.969e-05', 'epoch': '0.3457', 'num_input_tokens_seen': 28105310, 'train_runtime': '1.422e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4108', 'grad_norm': '1.065', 'learning_rate': '4.969e-05', 'epoch': '0.3457', 'num_input_tokens_seen': 28107357, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8608', 'grad_norm': '1.074', 'learning_rate': '4.969e-05', 'epoch': '0.3457', 'num_input_tokens_seen': 28109404, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.232', 'grad_norm': '0.8167', 'learning_rate': '4.969e-05', 'epoch': '0.3458', 'num_input_tokens_seen': 28111451, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6737', 'grad_norm': '1.19', 'learning_rate': '4.969e-05', 'epoch': '0.3458', 'num_input_tokens_seen': 28113498, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.005', 'grad_norm': '2', 'learning_rate': '4.969e-05', 'epoch': '0.3458', 'num_input_tokens_seen': 28115545, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4572', 'grad_norm': '0.9696', 'learning_rate': '4.969e-05', 'epoch': '0.3458', 'num_input_tokens_seen': 28117592, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8152', 'grad_norm': '1.746', 'learning_rate': '4.969e-05', 'epoch': '0.3459', 'num_input_tokens_seen': 28119639, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7495', 'grad_norm': '1.048', 'learning_rate': '4.969e-05', 'epoch': '0.3459', 'num_input_tokens_seen': 28121686, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6147', 'grad_norm': '0.9192', 'learning_rate': '4.969e-05', 'epoch': '0.3459', 'num_input_tokens_seen': 28123733, 'train_runtime': '1.423e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.338', 'grad_norm': '2.343', 'learning_rate': '4.969e-05', 'epoch': '0.3459', 'num_input_tokens_seen': 28125780, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3679', 'grad_norm': '0.9527', 'learning_rate': '4.969e-05', 'epoch': '0.346', 'num_input_tokens_seen': 28127827, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6941', 'grad_norm': '1.593', 'learning_rate': '4.969e-05', 'epoch': '0.346', 'num_input_tokens_seen': 28129874, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3507', 'grad_norm': '1.011', 'learning_rate': '4.969e-05', 'epoch': '0.346', 'num_input_tokens_seen': 28131921, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.85', 'grad_norm': '1.005', 'learning_rate': '4.969e-05', 'epoch': '0.346', 'num_input_tokens_seen': 28133968, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.04', 'grad_norm': '2.061', 'learning_rate': '4.969e-05', 'epoch': '0.3461', 'num_input_tokens_seen': 28136015, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3111', 'grad_norm': '0.899', 'learning_rate': '4.969e-05', 'epoch': '0.3461', 'num_input_tokens_seen': 28138062, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.114', 'grad_norm': '1.471', 'learning_rate': '4.969e-05', 'epoch': '0.3461', 'num_input_tokens_seen': 28140109, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.711', 'grad_norm': '2.417', 'learning_rate': '4.969e-05', 'epoch': '0.3461', 'num_input_tokens_seen': 28142156, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2391', 'grad_norm': '0.7397', 'learning_rate': '4.969e-05', 'epoch': '0.3462', 'num_input_tokens_seen': 28144203, 'train_runtime': '1.424e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5778', 'grad_norm': '1.172', 'learning_rate': '4.969e-05', 'epoch': '0.3462', 'num_input_tokens_seen': 28146250, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.442', 'grad_norm': '2.591', 'learning_rate': '4.969e-05', 'epoch': '0.3462', 'num_input_tokens_seen': 28148297, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3359', 'grad_norm': '0.9657', 'learning_rate': '4.969e-05', 'epoch': '0.3462', 'num_input_tokens_seen': 28150344, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2328', 'grad_norm': '0.8731', 'learning_rate': '4.969e-05', 'epoch': '0.3463', 'num_input_tokens_seen': 28152391, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.358', 'grad_norm': '0.7491', 'learning_rate': '4.969e-05', 'epoch': '0.3463', 'num_input_tokens_seen': 28154438, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5166', 'grad_norm': '1.485', 'learning_rate': '4.969e-05', 'epoch': '0.3463', 'num_input_tokens_seen': 28156485, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.646', 'grad_norm': '2.203', 'learning_rate': '4.969e-05', 'epoch': '0.3464', 'num_input_tokens_seen': 28158532, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9501', 'grad_norm': '1.805', 'learning_rate': '4.969e-05', 'epoch': '0.3464', 'num_input_tokens_seen': 28160579, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8598', 'grad_norm': '1.174', 'learning_rate': '4.969e-05', 'epoch': '0.3464', 'num_input_tokens_seen': 28162626, 'train_runtime': '1.425e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.432', 'grad_norm': '2.342', 'learning_rate': '4.969e-05', 'epoch': '0.3464', 'num_input_tokens_seen': 28164673, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3892', 'grad_norm': '0.9534', 'learning_rate': '4.969e-05', 'epoch': '0.3465', 'num_input_tokens_seen': 28166720, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6349', 'grad_norm': '1.24', 'learning_rate': '4.969e-05', 'epoch': '0.3465', 'num_input_tokens_seen': 28168767, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6091', 'grad_norm': '1.212', 'learning_rate': '4.969e-05', 'epoch': '0.3465', 'num_input_tokens_seen': 28170814, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.622', 'grad_norm': '1.365', 'learning_rate': '4.969e-05', 'epoch': '0.3465', 'num_input_tokens_seen': 28172861, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7774', 'grad_norm': '1.26', 'learning_rate': '4.969e-05', 'epoch': '0.3466', 'num_input_tokens_seen': 28174908, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.006', 'grad_norm': '1.266', 'learning_rate': '4.969e-05', 'epoch': '0.3466', 'num_input_tokens_seen': 28176955, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2999', 'grad_norm': '0.7876', 'learning_rate': '4.969e-05', 'epoch': '0.3466', 'num_input_tokens_seen': 28179002, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.084', 'grad_norm': '1.76', 'learning_rate': '4.969e-05', 'epoch': '0.3466', 'num_input_tokens_seen': 28181049, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.483', 'grad_norm': '2.363', 'learning_rate': '4.969e-05', 'epoch': '0.3467', 'num_input_tokens_seen': 28183096, 'train_runtime': '1.426e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8368', 'grad_norm': '1.836', 'learning_rate': '4.969e-05', 'epoch': '0.3467', 'num_input_tokens_seen': 28185143, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2214', 'grad_norm': '0.793', 'learning_rate': '4.969e-05', 'epoch': '0.3467', 'num_input_tokens_seen': 28187190, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7859', 'grad_norm': '1.575', 'learning_rate': '4.969e-05', 'epoch': '0.3467', 'num_input_tokens_seen': 28189237, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.757', 'grad_norm': '1.986', 'learning_rate': '4.969e-05', 'epoch': '0.3468', 'num_input_tokens_seen': 28191284, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.263', 'grad_norm': '0.7813', 'learning_rate': '4.969e-05', 'epoch': '0.3468', 'num_input_tokens_seen': 28193331, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5888', 'grad_norm': '1.195', 'learning_rate': '4.969e-05', 'epoch': '0.3468', 'num_input_tokens_seen': 28195378, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.178', 'grad_norm': '2.57', 'learning_rate': '4.969e-05', 'epoch': '0.3468', 'num_input_tokens_seen': 28197425, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.247', 'grad_norm': '2.152', 'learning_rate': '4.969e-05', 'epoch': '0.3469', 'num_input_tokens_seen': 28199472, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5343', 'grad_norm': '1.069', 'learning_rate': '4.969e-05', 'epoch': '0.3469', 'num_input_tokens_seen': 28201519, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6001', 'grad_norm': '0.9386', 'learning_rate': '4.969e-05', 'epoch': '0.3469', 'num_input_tokens_seen': 28203566, 'train_runtime': '1.427e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.314', 'grad_norm': '1.01', 'learning_rate': '4.969e-05', 'epoch': '0.3469', 'num_input_tokens_seen': 28205613, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8038', 'grad_norm': '0.9875', 'learning_rate': '4.969e-05', 'epoch': '0.347', 'num_input_tokens_seen': 28207660, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.647', 'grad_norm': '1.275', 'learning_rate': '4.969e-05', 'epoch': '0.347', 'num_input_tokens_seen': 28209707, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7176', 'grad_norm': '1.098', 'learning_rate': '4.969e-05', 'epoch': '0.347', 'num_input_tokens_seen': 28211754, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6709', 'grad_norm': '1.093', 'learning_rate': '4.969e-05', 'epoch': '0.347', 'num_input_tokens_seen': 28213801, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.055', 'grad_norm': '1.869', 'learning_rate': '4.969e-05', 'epoch': '0.3471', 'num_input_tokens_seen': 28215848, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.436', 'grad_norm': '2.525', 'learning_rate': '4.969e-05', 'epoch': '0.3471', 'num_input_tokens_seen': 28217895, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.759', 'grad_norm': '2.509', 'learning_rate': '4.969e-05', 'epoch': '0.3471', 'num_input_tokens_seen': 28219942, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5067', 'grad_norm': '1.057', 'learning_rate': '4.969e-05', 'epoch': '0.3471', 'num_input_tokens_seen': 28221989, 'train_runtime': '1.428e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1357', 'grad_norm': '0.7479', 'learning_rate': '4.969e-05', 'epoch': '0.3472', 'num_input_tokens_seen': 28224036, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.291', 'grad_norm': '2.303', 'learning_rate': '4.969e-05', 'epoch': '0.3472', 'num_input_tokens_seen': 28226083, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.525', 'grad_norm': '2.219', 'learning_rate': '4.969e-05', 'epoch': '0.3472', 'num_input_tokens_seen': 28228130, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.22', 'grad_norm': '0.9122', 'learning_rate': '4.969e-05', 'epoch': '0.3472', 'num_input_tokens_seen': 28230177, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2605', 'grad_norm': '1.009', 'learning_rate': '4.969e-05', 'epoch': '0.3473', 'num_input_tokens_seen': 28232224, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.16', 'grad_norm': '1.934', 'learning_rate': '4.969e-05', 'epoch': '0.3473', 'num_input_tokens_seen': 28234271, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5357', 'grad_norm': '1.393', 'learning_rate': '4.969e-05', 'epoch': '0.3473', 'num_input_tokens_seen': 28236318, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3621', 'grad_norm': '1.156', 'learning_rate': '4.969e-05', 'epoch': '0.3473', 'num_input_tokens_seen': 28238365, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2904', 'grad_norm': '0.9572', 'learning_rate': '4.969e-05', 'epoch': '0.3474', 'num_input_tokens_seen': 28240412, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6436', 'grad_norm': '1.458', 'learning_rate': '4.969e-05', 'epoch': '0.3474', 'num_input_tokens_seen': 28242459, 'train_runtime': '1.429e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4714', 'grad_norm': '1.158', 'learning_rate': '4.969e-05', 'epoch': '0.3474', 'num_input_tokens_seen': 28244506, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2442', 'grad_norm': '0.8333', 'learning_rate': '4.969e-05', 'epoch': '0.3474', 'num_input_tokens_seen': 28246553, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2823', 'grad_norm': '0.9081', 'learning_rate': '4.969e-05', 'epoch': '0.3475', 'num_input_tokens_seen': 28248600, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.069', 'grad_norm': '1.513', 'learning_rate': '4.969e-05', 'epoch': '0.3475', 'num_input_tokens_seen': 28250647, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3364', 'grad_norm': '0.7974', 'learning_rate': '4.969e-05', 'epoch': '0.3475', 'num_input_tokens_seen': 28252694, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.199', 'grad_norm': '2.311', 'learning_rate': '4.969e-05', 'epoch': '0.3475', 'num_input_tokens_seen': 28254741, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.599', 'grad_norm': '2.454', 'learning_rate': '4.969e-05', 'epoch': '0.3476', 'num_input_tokens_seen': 28256788, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.532', 'grad_norm': '1.396', 'learning_rate': '4.969e-05', 'epoch': '0.3476', 'num_input_tokens_seen': 28258835, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6927', 'grad_norm': '1.497', 'learning_rate': '4.969e-05', 'epoch': '0.3476', 'num_input_tokens_seen': 28260882, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9565', 'grad_norm': '1.539', 'learning_rate': '4.969e-05', 'epoch': '0.3476', 'num_input_tokens_seen': 28262929, 'train_runtime': '1.43e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5543', 'grad_norm': '1.357', 'learning_rate': '4.969e-05', 'epoch': '0.3477', 'num_input_tokens_seen': 28264976, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2302', 'grad_norm': '0.7608', 'learning_rate': '4.969e-05', 'epoch': '0.3477', 'num_input_tokens_seen': 28267023, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7626', 'grad_norm': '2.131', 'learning_rate': '4.969e-05', 'epoch': '0.3477', 'num_input_tokens_seen': 28269070, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5383', 'grad_norm': '1.329', 'learning_rate': '4.969e-05', 'epoch': '0.3477', 'num_input_tokens_seen': 28271117, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3663', 'grad_norm': '0.8397', 'learning_rate': '4.969e-05', 'epoch': '0.3478', 'num_input_tokens_seen': 28273164, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.185', 'grad_norm': '0.8345', 'learning_rate': '4.969e-05', 'epoch': '0.3478', 'num_input_tokens_seen': 28275211, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3317', 'grad_norm': '0.8138', 'learning_rate': '4.969e-05', 'epoch': '0.3478', 'num_input_tokens_seen': 28277258, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3842', 'grad_norm': '1.039', 'learning_rate': '4.969e-05', 'epoch': '0.3478', 'num_input_tokens_seen': 28279305, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9436', 'grad_norm': '1.66', 'learning_rate': '4.969e-05', 'epoch': '0.3479', 'num_input_tokens_seen': 28281352, 'train_runtime': '1.431e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.489', 'grad_norm': '2.179', 'learning_rate': '4.969e-05', 'epoch': '0.3479', 'num_input_tokens_seen': 28283399, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.861', 'grad_norm': '1.901', 'learning_rate': '4.968e-05', 'epoch': '0.3479', 'num_input_tokens_seen': 28285446, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.566', 'grad_norm': '1.195', 'learning_rate': '4.968e-05', 'epoch': '0.3479', 'num_input_tokens_seen': 28287493, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.114', 'grad_norm': '1.658', 'learning_rate': '4.968e-05', 'epoch': '0.348', 'num_input_tokens_seen': 28289540, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.038', 'grad_norm': '1.866', 'learning_rate': '4.968e-05', 'epoch': '0.348', 'num_input_tokens_seen': 28291587, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.693', 'grad_norm': '3.172', 'learning_rate': '4.968e-05', 'epoch': '0.348', 'num_input_tokens_seen': 28293634, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3572', 'grad_norm': '0.8996', 'learning_rate': '4.968e-05', 'epoch': '0.348', 'num_input_tokens_seen': 28295681, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1847', 'grad_norm': '0.7158', 'learning_rate': '4.968e-05', 'epoch': '0.3481', 'num_input_tokens_seen': 28297728, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.596', 'grad_norm': '1.307', 'learning_rate': '4.968e-05', 'epoch': '0.3481', 'num_input_tokens_seen': 28299775, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6046', 'grad_norm': '1.149', 'learning_rate': '4.968e-05', 'epoch': '0.3481', 'num_input_tokens_seen': 28301822, 'train_runtime': '1.432e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4222', 'grad_norm': '0.8984', 'learning_rate': '4.968e-05', 'epoch': '0.3481', 'num_input_tokens_seen': 28303869, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.827', 'grad_norm': '3.461', 'learning_rate': '4.968e-05', 'epoch': '0.3482', 'num_input_tokens_seen': 28305916, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8623', 'grad_norm': '1.302', 'learning_rate': '4.968e-05', 'epoch': '0.3482', 'num_input_tokens_seen': 28307963, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7951', 'grad_norm': '1.309', 'learning_rate': '4.968e-05', 'epoch': '0.3482', 'num_input_tokens_seen': 28310010, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.016', 'grad_norm': '1.624', 'learning_rate': '4.968e-05', 'epoch': '0.3482', 'num_input_tokens_seen': 28312057, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6032', 'grad_norm': '0.905', 'learning_rate': '4.968e-05', 'epoch': '0.3483', 'num_input_tokens_seen': 28314104, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6104', 'grad_norm': '1.078', 'learning_rate': '4.968e-05', 'epoch': '0.3483', 'num_input_tokens_seen': 28316151, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8269', 'grad_norm': '1.403', 'learning_rate': '4.968e-05', 'epoch': '0.3483', 'num_input_tokens_seen': 28318198, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3551', 'grad_norm': '0.9159', 'learning_rate': '4.968e-05', 'epoch': '0.3483', 'num_input_tokens_seen': 28320245, 'train_runtime': '1.433e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.311', 'grad_norm': '2.485', 'learning_rate': '4.968e-05', 'epoch': '0.3484', 'num_input_tokens_seen': 28322292, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5744', 'grad_norm': '1.115', 'learning_rate': '4.968e-05', 'epoch': '0.3484', 'num_input_tokens_seen': 28324339, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2608', 'grad_norm': '0.9481', 'learning_rate': '4.968e-05', 'epoch': '0.3484', 'num_input_tokens_seen': 28326386, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6575', 'grad_norm': '1.037', 'learning_rate': '4.968e-05', 'epoch': '0.3484', 'num_input_tokens_seen': 28328433, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6597', 'grad_norm': '1.131', 'learning_rate': '4.968e-05', 'epoch': '0.3485', 'num_input_tokens_seen': 28330480, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4547', 'grad_norm': '1.253', 'learning_rate': '4.968e-05', 'epoch': '0.3485', 'num_input_tokens_seen': 28332527, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.359', 'grad_norm': '2.014', 'learning_rate': '4.968e-05', 'epoch': '0.3485', 'num_input_tokens_seen': 28334574, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6105', 'grad_norm': '1.267', 'learning_rate': '4.968e-05', 'epoch': '0.3485', 'num_input_tokens_seen': 28336621, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6555', 'grad_norm': '1.176', 'learning_rate': '4.968e-05', 'epoch': '0.3486', 'num_input_tokens_seen': 28338668, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.341', 'grad_norm': '1.039', 'learning_rate': '4.968e-05', 'epoch': '0.3486', 'num_input_tokens_seen': 28340715, 'train_runtime': '1.434e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6157', 'grad_norm': '1.292', 'learning_rate': '4.968e-05', 'epoch': '0.3486', 'num_input_tokens_seen': 28342762, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3994', 'grad_norm': '0.9945', 'learning_rate': '4.968e-05', 'epoch': '0.3486', 'num_input_tokens_seen': 28344809, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1672', 'grad_norm': '0.8153', 'learning_rate': '4.968e-05', 'epoch': '0.3487', 'num_input_tokens_seen': 28346856, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.636', 'grad_norm': '2.45', 'learning_rate': '4.968e-05', 'epoch': '0.3487', 'num_input_tokens_seen': 28348903, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9021', 'grad_norm': '1.948', 'learning_rate': '4.968e-05', 'epoch': '0.3487', 'num_input_tokens_seen': 28350950, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.447', 'grad_norm': '1.038', 'learning_rate': '4.968e-05', 'epoch': '0.3487', 'num_input_tokens_seen': 28352997, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3645', 'grad_norm': '1.293', 'learning_rate': '4.968e-05', 'epoch': '0.3488', 'num_input_tokens_seen': 28355044, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3217', 'grad_norm': '0.7598', 'learning_rate': '4.968e-05', 'epoch': '0.3488', 'num_input_tokens_seen': 28357091, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4024', 'grad_norm': '0.9096', 'learning_rate': '4.968e-05', 'epoch': '0.3488', 'num_input_tokens_seen': 28359138, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5569', 'grad_norm': '1.445', 'learning_rate': '4.968e-05', 'epoch': '0.3488', 'num_input_tokens_seen': 28361185, 'train_runtime': '1.435e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7219', 'grad_norm': '1.246', 'learning_rate': '4.968e-05', 'epoch': '0.3489', 'num_input_tokens_seen': 28363232, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.321', 'grad_norm': '2.462', 'learning_rate': '4.968e-05', 'epoch': '0.3489', 'num_input_tokens_seen': 28365279, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5526', 'grad_norm': '1.523', 'learning_rate': '4.968e-05', 'epoch': '0.3489', 'num_input_tokens_seen': 28367326, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2177', 'grad_norm': '0.863', 'learning_rate': '4.968e-05', 'epoch': '0.3489', 'num_input_tokens_seen': 28369373, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5817', 'grad_norm': '1.642', 'learning_rate': '4.968e-05', 'epoch': '0.349', 'num_input_tokens_seen': 28371420, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4586', 'grad_norm': '0.9539', 'learning_rate': '4.968e-05', 'epoch': '0.349', 'num_input_tokens_seen': 28373467, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9144', 'grad_norm': '1.607', 'learning_rate': '4.968e-05', 'epoch': '0.349', 'num_input_tokens_seen': 28375514, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.939', 'grad_norm': '2.653', 'learning_rate': '4.968e-05', 'epoch': '0.349', 'num_input_tokens_seen': 28377561, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6477', 'grad_norm': '1.508', 'learning_rate': '4.968e-05', 'epoch': '0.3491', 'num_input_tokens_seen': 28379608, 'train_runtime': '1.436e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8618', 'grad_norm': '1.265', 'learning_rate': '4.968e-05', 'epoch': '0.3491', 'num_input_tokens_seen': 28381655, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6182', 'grad_norm': '1.302', 'learning_rate': '4.968e-05', 'epoch': '0.3491', 'num_input_tokens_seen': 28383702, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6445', 'grad_norm': '1.239', 'learning_rate': '4.968e-05', 'epoch': '0.3491', 'num_input_tokens_seen': 28385749, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5779', 'grad_norm': '1.317', 'learning_rate': '4.968e-05', 'epoch': '0.3492', 'num_input_tokens_seen': 28387796, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.145', 'grad_norm': '2.054', 'learning_rate': '4.968e-05', 'epoch': '0.3492', 'num_input_tokens_seen': 28389843, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4272', 'grad_norm': '1.095', 'learning_rate': '4.968e-05', 'epoch': '0.3492', 'num_input_tokens_seen': 28391890, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2743', 'grad_norm': '0.8876', 'learning_rate': '4.968e-05', 'epoch': '0.3492', 'num_input_tokens_seen': 28393937, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5652', 'grad_norm': '1.01', 'learning_rate': '4.968e-05', 'epoch': '0.3493', 'num_input_tokens_seen': 28395984, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.681', 'grad_norm': '2.895', 'learning_rate': '4.968e-05', 'epoch': '0.3493', 'num_input_tokens_seen': 28398031, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.105', 'grad_norm': '2.418', 'learning_rate': '4.968e-05', 'epoch': '0.3493', 'num_input_tokens_seen': 28400078, 'train_runtime': '1.437e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4325', 'grad_norm': '1.068', 'learning_rate': '4.968e-05', 'epoch': '0.3493', 'num_input_tokens_seen': 28402125, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.082', 'grad_norm': '1.482', 'learning_rate': '4.968e-05', 'epoch': '0.3494', 'num_input_tokens_seen': 28404172, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.523', 'grad_norm': '2.705', 'learning_rate': '4.968e-05', 'epoch': '0.3494', 'num_input_tokens_seen': 28406219, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6621', 'grad_norm': '1.497', 'learning_rate': '4.968e-05', 'epoch': '0.3494', 'num_input_tokens_seen': 28408266, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4827', 'grad_norm': '1.076', 'learning_rate': '4.968e-05', 'epoch': '0.3494', 'num_input_tokens_seen': 28410313, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6087', 'grad_norm': '1.266', 'learning_rate': '4.968e-05', 'epoch': '0.3495', 'num_input_tokens_seen': 28412360, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8021', 'grad_norm': '1.29', 'learning_rate': '4.968e-05', 'epoch': '0.3495', 'num_input_tokens_seen': 28414407, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4928', 'grad_norm': '0.9904', 'learning_rate': '4.968e-05', 'epoch': '0.3495', 'num_input_tokens_seen': 28416454, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5968', 'grad_norm': '1.104', 'learning_rate': '4.968e-05', 'epoch': '0.3495', 'num_input_tokens_seen': 28418501, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4243', 'grad_norm': '1.1', 'learning_rate': '4.968e-05', 'epoch': '0.3496', 'num_input_tokens_seen': 28420548, 'train_runtime': '1.438e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3776', 'grad_norm': '0.8462', 'learning_rate': '4.968e-05', 'epoch': '0.3496', 'num_input_tokens_seen': 28422595, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4103', 'grad_norm': '0.8567', 'learning_rate': '4.968e-05', 'epoch': '0.3496', 'num_input_tokens_seen': 28424642, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.681', 'grad_norm': '2.385', 'learning_rate': '4.968e-05', 'epoch': '0.3496', 'num_input_tokens_seen': 28426689, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.037', 'grad_norm': '1.638', 'learning_rate': '4.968e-05', 'epoch': '0.3497', 'num_input_tokens_seen': 28428736, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6062', 'grad_norm': '1.077', 'learning_rate': '4.968e-05', 'epoch': '0.3497', 'num_input_tokens_seen': 28430783, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6344', 'grad_norm': '1.691', 'learning_rate': '4.968e-05', 'epoch': '0.3497', 'num_input_tokens_seen': 28432830, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.651', 'grad_norm': '1.582', 'learning_rate': '4.968e-05', 'epoch': '0.3497', 'num_input_tokens_seen': 28434877, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.044', 'grad_norm': '1.839', 'learning_rate': '4.968e-05', 'epoch': '0.3498', 'num_input_tokens_seen': 28436924, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5076', 'grad_norm': '1.199', 'learning_rate': '4.968e-05', 'epoch': '0.3498', 'num_input_tokens_seen': 28438971, 'train_runtime': '1.439e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6209', 'grad_norm': '1.235', 'learning_rate': '4.968e-05', 'epoch': '0.3498', 'num_input_tokens_seen': 28441018, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6137', 'grad_norm': '1.437', 'learning_rate': '4.968e-05', 'epoch': '0.3499', 'num_input_tokens_seen': 28443065, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5957', 'grad_norm': '1.232', 'learning_rate': '4.968e-05', 'epoch': '0.3499', 'num_input_tokens_seen': 28445112, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.347', 'grad_norm': '2.076', 'learning_rate': '4.968e-05', 'epoch': '0.3499', 'num_input_tokens_seen': 28447159, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5692', 'grad_norm': '1.33', 'learning_rate': '4.968e-05', 'epoch': '0.3499', 'num_input_tokens_seen': 28449206, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.322', 'grad_norm': '2.393', 'learning_rate': '4.968e-05', 'epoch': '0.35', 'num_input_tokens_seen': 28451253, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5525', 'grad_norm': '1.295', 'learning_rate': '4.968e-05', 'epoch': '0.35', 'num_input_tokens_seen': 28453300, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5237', 'grad_norm': '1.182', 'learning_rate': '4.968e-05', 'epoch': '0.35', 'num_input_tokens_seen': 28455347, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5311', 'grad_norm': '1.051', 'learning_rate': '4.968e-05', 'epoch': '0.35', 'num_input_tokens_seen': 28457394, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.237', 'grad_norm': '2.352', 'learning_rate': '4.968e-05', 'epoch': '0.3501', 'num_input_tokens_seen': 28459441, 'train_runtime': '1.44e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1904', 'grad_norm': '0.8702', 'learning_rate': '4.968e-05', 'epoch': '0.3501', 'num_input_tokens_seen': 28461488, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7416', 'grad_norm': '1.299', 'learning_rate': '4.968e-05', 'epoch': '0.3501', 'num_input_tokens_seen': 28463535, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5397', 'grad_norm': '1.499', 'learning_rate': '4.968e-05', 'epoch': '0.3501', 'num_input_tokens_seen': 28465582, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.447', 'grad_norm': '2.189', 'learning_rate': '4.968e-05', 'epoch': '0.3502', 'num_input_tokens_seen': 28467629, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2614', 'grad_norm': '0.8127', 'learning_rate': '4.968e-05', 'epoch': '0.3502', 'num_input_tokens_seen': 28469676, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3827', 'grad_norm': '0.8099', 'learning_rate': '4.968e-05', 'epoch': '0.3502', 'num_input_tokens_seen': 28471723, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7399', 'grad_norm': '1.864', 'learning_rate': '4.968e-05', 'epoch': '0.3502', 'num_input_tokens_seen': 28473770, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3162', 'grad_norm': '0.9613', 'learning_rate': '4.968e-05', 'epoch': '0.3503', 'num_input_tokens_seen': 28475817, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.187', 'grad_norm': '2.036', 'learning_rate': '4.968e-05', 'epoch': '0.3503', 'num_input_tokens_seen': 28477864, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.482', 'grad_norm': '2.274', 'learning_rate': '4.968e-05', 'epoch': '0.3503', 'num_input_tokens_seen': 28479911, 'train_runtime': '1.441e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.95', 'grad_norm': '1.707', 'learning_rate': '4.968e-05', 'epoch': '0.3503', 'num_input_tokens_seen': 28481958, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3827', 'grad_norm': '0.8185', 'learning_rate': '4.968e-05', 'epoch': '0.3504', 'num_input_tokens_seen': 28484005, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8668', 'grad_norm': '1.8', 'learning_rate': '4.968e-05', 'epoch': '0.3504', 'num_input_tokens_seen': 28486052, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9803', 'grad_norm': '2.142', 'learning_rate': '4.968e-05', 'epoch': '0.3504', 'num_input_tokens_seen': 28488099, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7419', 'grad_norm': '1.242', 'learning_rate': '4.968e-05', 'epoch': '0.3504', 'num_input_tokens_seen': 28490146, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.247', 'grad_norm': '0.8415', 'learning_rate': '4.968e-05', 'epoch': '0.3505', 'num_input_tokens_seen': 28492193, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.286', 'grad_norm': '0.9688', 'learning_rate': '4.968e-05', 'epoch': '0.3505', 'num_input_tokens_seen': 28494240, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2993', 'grad_norm': '0.9525', 'learning_rate': '4.968e-05', 'epoch': '0.3505', 'num_input_tokens_seen': 28496287, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.222', 'grad_norm': '1.829', 'learning_rate': '4.968e-05', 'epoch': '0.3505', 'num_input_tokens_seen': 28498334, 'train_runtime': '1.442e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1842', 'grad_norm': '0.7755', 'learning_rate': '4.968e-05', 'epoch': '0.3506', 'num_input_tokens_seen': 28500381, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.708', 'grad_norm': '1.295', 'learning_rate': '4.968e-05', 'epoch': '0.3506', 'num_input_tokens_seen': 28502428, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5804', 'grad_norm': '1.239', 'learning_rate': '4.968e-05', 'epoch': '0.3506', 'num_input_tokens_seen': 28504475, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5659', 'grad_norm': '1.148', 'learning_rate': '4.968e-05', 'epoch': '0.3506', 'num_input_tokens_seen': 28506522, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3045', 'grad_norm': '0.8623', 'learning_rate': '4.968e-05', 'epoch': '0.3507', 'num_input_tokens_seen': 28508569, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5289', 'grad_norm': '1.138', 'learning_rate': '4.968e-05', 'epoch': '0.3507', 'num_input_tokens_seen': 28510616, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3349', 'grad_norm': '0.8753', 'learning_rate': '4.968e-05', 'epoch': '0.3507', 'num_input_tokens_seen': 28512663, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2197', 'grad_norm': '0.8255', 'learning_rate': '4.968e-05', 'epoch': '0.3507', 'num_input_tokens_seen': 28514710, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7468', 'grad_norm': '1.413', 'learning_rate': '4.968e-05', 'epoch': '0.3508', 'num_input_tokens_seen': 28516757, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5967', 'grad_norm': '1.328', 'learning_rate': '4.968e-05', 'epoch': '0.3508', 'num_input_tokens_seen': 28518804, 'train_runtime': '1.443e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6759', 'grad_norm': '1.155', 'learning_rate': '4.968e-05', 'epoch': '0.3508', 'num_input_tokens_seen': 28520851, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9911', 'grad_norm': '2.238', 'learning_rate': '4.968e-05', 'epoch': '0.3508', 'num_input_tokens_seen': 28522898, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1805', 'grad_norm': '0.728', 'learning_rate': '4.968e-05', 'epoch': '0.3509', 'num_input_tokens_seen': 28524945, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8804', 'grad_norm': '1.385', 'learning_rate': '4.968e-05', 'epoch': '0.3509', 'num_input_tokens_seen': 28526992, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8995', 'grad_norm': '1.207', 'learning_rate': '4.968e-05', 'epoch': '0.3509', 'num_input_tokens_seen': 28529039, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.273', 'grad_norm': '2.628', 'learning_rate': '4.968e-05', 'epoch': '0.3509', 'num_input_tokens_seen': 28531086, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3791', 'grad_norm': '0.9656', 'learning_rate': '4.968e-05', 'epoch': '0.351', 'num_input_tokens_seen': 28533133, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.51', 'grad_norm': '2.743', 'learning_rate': '4.968e-05', 'epoch': '0.351', 'num_input_tokens_seen': 28535180, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7956', 'grad_norm': '1.628', 'learning_rate': '4.968e-05', 'epoch': '0.351', 'num_input_tokens_seen': 28537227, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5506', 'grad_norm': '1.274', 'learning_rate': '4.968e-05', 'epoch': '0.351', 'num_input_tokens_seen': 28539274, 'train_runtime': '1.444e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.533', 'grad_norm': '1.179', 'learning_rate': '4.968e-05', 'epoch': '0.3511', 'num_input_tokens_seen': 28541321, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.145', 'grad_norm': '1.831', 'learning_rate': '4.968e-05', 'epoch': '0.3511', 'num_input_tokens_seen': 28543368, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3894', 'grad_norm': '1.209', 'learning_rate': '4.968e-05', 'epoch': '0.3511', 'num_input_tokens_seen': 28545415, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8557', 'grad_norm': '1.285', 'learning_rate': '4.968e-05', 'epoch': '0.3511', 'num_input_tokens_seen': 28547462, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.005', 'grad_norm': '1.846', 'learning_rate': '4.968e-05', 'epoch': '0.3512', 'num_input_tokens_seen': 28549509, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3241', 'grad_norm': '0.8984', 'learning_rate': '4.968e-05', 'epoch': '0.3512', 'num_input_tokens_seen': 28551556, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8807', 'grad_norm': '1.895', 'learning_rate': '4.968e-05', 'epoch': '0.3512', 'num_input_tokens_seen': 28553603, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9548', 'grad_norm': '2.063', 'learning_rate': '4.968e-05', 'epoch': '0.3512', 'num_input_tokens_seen': 28555650, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.562', 'grad_norm': '1.14', 'learning_rate': '4.968e-05', 'epoch': '0.3513', 'num_input_tokens_seen': 28557697, 'train_runtime': '1.445e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.109', 'grad_norm': '2.058', 'learning_rate': '4.968e-05', 'epoch': '0.3513', 'num_input_tokens_seen': 28559744, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2057', 'grad_norm': '0.7591', 'learning_rate': '4.968e-05', 'epoch': '0.3513', 'num_input_tokens_seen': 28561791, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2626', 'grad_norm': '0.8677', 'learning_rate': '4.968e-05', 'epoch': '0.3513', 'num_input_tokens_seen': 28563838, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4108', 'grad_norm': '0.9778', 'learning_rate': '4.968e-05', 'epoch': '0.3514', 'num_input_tokens_seen': 28565885, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7802', 'grad_norm': '1.496', 'learning_rate': '4.968e-05', 'epoch': '0.3514', 'num_input_tokens_seen': 28567932, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3347', 'grad_norm': '0.844', 'learning_rate': '4.968e-05', 'epoch': '0.3514', 'num_input_tokens_seen': 28569979, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.019', 'grad_norm': '1.716', 'learning_rate': '4.968e-05', 'epoch': '0.3514', 'num_input_tokens_seen': 28572026, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.108', 'grad_norm': '1.903', 'learning_rate': '4.968e-05', 'epoch': '0.3515', 'num_input_tokens_seen': 28574073, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6918', 'grad_norm': '1.802', 'learning_rate': '4.968e-05', 'epoch': '0.3515', 'num_input_tokens_seen': 28576120, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3905', 'grad_norm': '0.7856', 'learning_rate': '4.968e-05', 'epoch': '0.3515', 'num_input_tokens_seen': 28578167, 'train_runtime': '1.446e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.867', 'grad_norm': '1.197', 'learning_rate': '4.968e-05', 'epoch': '0.3515', 'num_input_tokens_seen': 28580214, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2947', 'grad_norm': '0.9195', 'learning_rate': '4.968e-05', 'epoch': '0.3516', 'num_input_tokens_seen': 28582261, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1978', 'grad_norm': '0.8624', 'learning_rate': '4.968e-05', 'epoch': '0.3516', 'num_input_tokens_seen': 28584308, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.047', 'grad_norm': '2.145', 'learning_rate': '4.968e-05', 'epoch': '0.3516', 'num_input_tokens_seen': 28586355, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6805', 'grad_norm': '1.253', 'learning_rate': '4.968e-05', 'epoch': '0.3516', 'num_input_tokens_seen': 28588402, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5691', 'grad_norm': '0.9103', 'learning_rate': '4.968e-05', 'epoch': '0.3517', 'num_input_tokens_seen': 28590449, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.275', 'grad_norm': '2.192', 'learning_rate': '4.968e-05', 'epoch': '0.3517', 'num_input_tokens_seen': 28592496, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2031', 'grad_norm': '0.7144', 'learning_rate': '4.968e-05', 'epoch': '0.3517', 'num_input_tokens_seen': 28594543, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2571', 'grad_norm': '0.8489', 'learning_rate': '4.968e-05', 'epoch': '0.3517', 'num_input_tokens_seen': 28596590, 'train_runtime': '1.447e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3619', 'grad_norm': '1.048', 'learning_rate': '4.968e-05', 'epoch': '0.3518', 'num_input_tokens_seen': 28598637, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7546', 'grad_norm': '1.923', 'learning_rate': '4.968e-05', 'epoch': '0.3518', 'num_input_tokens_seen': 28600684, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.595', 'grad_norm': '3.12', 'learning_rate': '4.967e-05', 'epoch': '0.3518', 'num_input_tokens_seen': 28602731, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6669', 'grad_norm': '1.134', 'learning_rate': '4.967e-05', 'epoch': '0.3518', 'num_input_tokens_seen': 28604778, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7227', 'grad_norm': '1.296', 'learning_rate': '4.967e-05', 'epoch': '0.3519', 'num_input_tokens_seen': 28606825, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.726', 'grad_norm': '2.51', 'learning_rate': '4.967e-05', 'epoch': '0.3519', 'num_input_tokens_seen': 28608872, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9362', 'grad_norm': '1.797', 'learning_rate': '4.967e-05', 'epoch': '0.3519', 'num_input_tokens_seen': 28610919, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3932', 'grad_norm': '1.041', 'learning_rate': '4.967e-05', 'epoch': '0.3519', 'num_input_tokens_seen': 28612966, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9823', 'grad_norm': '1.715', 'learning_rate': '4.967e-05', 'epoch': '0.352', 'num_input_tokens_seen': 28615013, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7144', 'grad_norm': '2.024', 'learning_rate': '4.967e-05', 'epoch': '0.352', 'num_input_tokens_seen': 28617060, 'train_runtime': '1.448e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7697', 'grad_norm': '1.417', 'learning_rate': '4.967e-05', 'epoch': '0.352', 'num_input_tokens_seen': 28619107, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6555', 'grad_norm': '1.303', 'learning_rate': '4.967e-05', 'epoch': '0.352', 'num_input_tokens_seen': 28621154, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6669', 'grad_norm': '0.9097', 'learning_rate': '4.967e-05', 'epoch': '0.3521', 'num_input_tokens_seen': 28623201, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.066', 'grad_norm': '2.459', 'learning_rate': '4.967e-05', 'epoch': '0.3521', 'num_input_tokens_seen': 28625248, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2577', 'grad_norm': '0.8169', 'learning_rate': '4.967e-05', 'epoch': '0.3521', 'num_input_tokens_seen': 28627295, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3307', 'grad_norm': '0.8239', 'learning_rate': '4.967e-05', 'epoch': '0.3521', 'num_input_tokens_seen': 28629342, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7957', 'grad_norm': '1.115', 'learning_rate': '4.967e-05', 'epoch': '0.3522', 'num_input_tokens_seen': 28631389, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.192', 'grad_norm': '2.134', 'learning_rate': '4.967e-05', 'epoch': '0.3522', 'num_input_tokens_seen': 28633436, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3898', 'grad_norm': '0.8634', 'learning_rate': '4.967e-05', 'epoch': '0.3522', 'num_input_tokens_seen': 28635483, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.122', 'grad_norm': '1.646', 'learning_rate': '4.967e-05', 'epoch': '0.3522', 'num_input_tokens_seen': 28637530, 'train_runtime': '1.449e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9554', 'grad_norm': '1.699', 'learning_rate': '4.967e-05', 'epoch': '0.3523', 'num_input_tokens_seen': 28639577, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9372', 'grad_norm': '1.842', 'learning_rate': '4.967e-05', 'epoch': '0.3523', 'num_input_tokens_seen': 28641624, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2044', 'grad_norm': '0.9563', 'learning_rate': '4.967e-05', 'epoch': '0.3523', 'num_input_tokens_seen': 28643671, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5393', 'grad_norm': '1.675', 'learning_rate': '4.967e-05', 'epoch': '0.3523', 'num_input_tokens_seen': 28645718, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3771', 'grad_norm': '0.939', 'learning_rate': '4.967e-05', 'epoch': '0.3524', 'num_input_tokens_seen': 28647765, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.073', 'grad_norm': '2.048', 'learning_rate': '4.967e-05', 'epoch': '0.3524', 'num_input_tokens_seen': 28649812, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5799', 'grad_norm': '1.628', 'learning_rate': '4.967e-05', 'epoch': '0.3524', 'num_input_tokens_seen': 28651859, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.355', 'grad_norm': '3.241', 'learning_rate': '4.967e-05', 'epoch': '0.3524', 'num_input_tokens_seen': 28653906, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2508', 'grad_norm': '0.7805', 'learning_rate': '4.967e-05', 'epoch': '0.3525', 'num_input_tokens_seen': 28655953, 'train_runtime': '1.45e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.213', 'grad_norm': '2.275', 'learning_rate': '4.967e-05', 'epoch': '0.3525', 'num_input_tokens_seen': 28658000, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +[INFO|configuration_utils.py:665] 2026-02-05 06:39:10,939 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 06:39:10,939 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 06:39:11,403 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-14000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 06:39:11,412 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-14000/tokenizer_config.json + +{'loss': '0.3644', 'grad_norm': '0.8953', 'learning_rate': '4.967e-05', 'epoch': '0.3525', 'num_input_tokens_seen': 28660047, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9644', 'grad_norm': '1.791', 'learning_rate': '4.967e-05', 'epoch': '0.3525', 'num_input_tokens_seen': 28662094, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6496', 'grad_norm': '1.204', 'learning_rate': '4.967e-05', 'epoch': '0.3526', 'num_input_tokens_seen': 28664141, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7398', 'grad_norm': '1.144', 'learning_rate': '4.967e-05', 'epoch': '0.3526', 'num_input_tokens_seen': 28666188, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5032', 'grad_norm': '0.9937', 'learning_rate': '4.967e-05', 'epoch': '0.3526', 'num_input_tokens_seen': 28668235, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2725', 'grad_norm': '0.8687', 'learning_rate': '4.967e-05', 'epoch': '0.3526', 'num_input_tokens_seen': 28670282, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4737', 'grad_norm': '0.994', 'learning_rate': '4.967e-05', 'epoch': '0.3527', 'num_input_tokens_seen': 28672329, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.312', 'grad_norm': '2.689', 'learning_rate': '4.967e-05', 'epoch': '0.3527', 'num_input_tokens_seen': 28674376, 'train_runtime': '1.451e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3909', 'grad_norm': '1.076', 'learning_rate': '4.967e-05', 'epoch': '0.3527', 'num_input_tokens_seen': 28676423, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3303', 'grad_norm': '1.037', 'learning_rate': '4.967e-05', 'epoch': '0.3527', 'num_input_tokens_seen': 28678470, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4782', 'grad_norm': '1.17', 'learning_rate': '4.967e-05', 'epoch': '0.3528', 'num_input_tokens_seen': 28680517, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.791', 'grad_norm': '1.278', 'learning_rate': '4.967e-05', 'epoch': '0.3528', 'num_input_tokens_seen': 28682564, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3393', 'grad_norm': '1.076', 'learning_rate': '4.967e-05', 'epoch': '0.3528', 'num_input_tokens_seen': 28684611, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7949', 'grad_norm': '2.01', 'learning_rate': '4.967e-05', 'epoch': '0.3528', 'num_input_tokens_seen': 28686658, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5909', 'grad_norm': '0.9313', 'learning_rate': '4.967e-05', 'epoch': '0.3529', 'num_input_tokens_seen': 28688705, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.222', 'grad_norm': '2.101', 'learning_rate': '4.967e-05', 'epoch': '0.3529', 'num_input_tokens_seen': 28690752, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3649', 'grad_norm': '1.061', 'learning_rate': '4.967e-05', 'epoch': '0.3529', 'num_input_tokens_seen': 28692799, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7581', 'grad_norm': '1.487', 'learning_rate': '4.967e-05', 'epoch': '0.3529', 'num_input_tokens_seen': 28694846, 'train_runtime': '1.452e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.627', 'grad_norm': '2.561', 'learning_rate': '4.967e-05', 'epoch': '0.353', 'num_input_tokens_seen': 28696893, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7278', 'grad_norm': '1.246', 'learning_rate': '4.967e-05', 'epoch': '0.353', 'num_input_tokens_seen': 28698940, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8229', 'grad_norm': '1.232', 'learning_rate': '4.967e-05', 'epoch': '0.353', 'num_input_tokens_seen': 28700987, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2688', 'grad_norm': '0.8996', 'learning_rate': '4.967e-05', 'epoch': '0.353', 'num_input_tokens_seen': 28703034, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2913', 'grad_norm': '0.8996', 'learning_rate': '4.967e-05', 'epoch': '0.3531', 'num_input_tokens_seen': 28705081, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4467', 'grad_norm': '1.022', 'learning_rate': '4.967e-05', 'epoch': '0.3531', 'num_input_tokens_seen': 28707128, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4692', 'grad_norm': '1.437', 'learning_rate': '4.967e-05', 'epoch': '0.3531', 'num_input_tokens_seen': 28709175, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3541', 'grad_norm': '0.7873', 'learning_rate': '4.967e-05', 'epoch': '0.3531', 'num_input_tokens_seen': 28711222, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.481', 'grad_norm': '2.496', 'learning_rate': '4.967e-05', 'epoch': '0.3532', 'num_input_tokens_seen': 28713269, 'train_runtime': '1.453e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9672', 'grad_norm': '1.717', 'learning_rate': '4.967e-05', 'epoch': '0.3532', 'num_input_tokens_seen': 28715316, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9794', 'grad_norm': '2.006', 'learning_rate': '4.967e-05', 'epoch': '0.3532', 'num_input_tokens_seen': 28717363, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3367', 'grad_norm': '0.8214', 'learning_rate': '4.967e-05', 'epoch': '0.3532', 'num_input_tokens_seen': 28719410, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4433', 'grad_norm': '1.124', 'learning_rate': '4.967e-05', 'epoch': '0.3533', 'num_input_tokens_seen': 28721457, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4784', 'grad_norm': '1.259', 'learning_rate': '4.967e-05', 'epoch': '0.3533', 'num_input_tokens_seen': 28723504, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5462', 'grad_norm': '1.448', 'learning_rate': '4.967e-05', 'epoch': '0.3533', 'num_input_tokens_seen': 28725551, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6861', 'grad_norm': '1.287', 'learning_rate': '4.967e-05', 'epoch': '0.3533', 'num_input_tokens_seen': 28727598, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.786', 'grad_norm': '1.816', 'learning_rate': '4.967e-05', 'epoch': '0.3534', 'num_input_tokens_seen': 28729645, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.571', 'grad_norm': '2.26', 'learning_rate': '4.967e-05', 'epoch': '0.3534', 'num_input_tokens_seen': 28731692, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6638', 'grad_norm': '1.926', 'learning_rate': '4.967e-05', 'epoch': '0.3534', 'num_input_tokens_seen': 28733739, 'train_runtime': '1.454e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5696', 'grad_norm': '1.179', 'learning_rate': '4.967e-05', 'epoch': '0.3535', 'num_input_tokens_seen': 28735786, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.011', 'grad_norm': '2.078', 'learning_rate': '4.967e-05', 'epoch': '0.3535', 'num_input_tokens_seen': 28737833, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6875', 'grad_norm': '1.344', 'learning_rate': '4.967e-05', 'epoch': '0.3535', 'num_input_tokens_seen': 28739880, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.302', 'grad_norm': '1.843', 'learning_rate': '4.967e-05', 'epoch': '0.3535', 'num_input_tokens_seen': 28741927, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5685', 'grad_norm': '1.507', 'learning_rate': '4.967e-05', 'epoch': '0.3536', 'num_input_tokens_seen': 28743974, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.618', 'grad_norm': '1.45', 'learning_rate': '4.967e-05', 'epoch': '0.3536', 'num_input_tokens_seen': 28746021, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4575', 'grad_norm': '1.362', 'learning_rate': '4.967e-05', 'epoch': '0.3536', 'num_input_tokens_seen': 28748068, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2011', 'grad_norm': '0.8298', 'learning_rate': '4.967e-05', 'epoch': '0.3536', 'num_input_tokens_seen': 28750115, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.14', 'grad_norm': '1.807', 'learning_rate': '4.967e-05', 'epoch': '0.3537', 'num_input_tokens_seen': 28752162, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2409', 'grad_norm': '1.105', 'learning_rate': '4.967e-05', 'epoch': '0.3537', 'num_input_tokens_seen': 28754209, 'train_runtime': '1.455e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9404', 'grad_norm': '1.223', 'learning_rate': '4.967e-05', 'epoch': '0.3537', 'num_input_tokens_seen': 28756256, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6639', 'grad_norm': '1.486', 'learning_rate': '4.967e-05', 'epoch': '0.3537', 'num_input_tokens_seen': 28758303, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6005', 'grad_norm': '1.083', 'learning_rate': '4.967e-05', 'epoch': '0.3538', 'num_input_tokens_seen': 28760350, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4669', 'grad_norm': '1.037', 'learning_rate': '4.967e-05', 'epoch': '0.3538', 'num_input_tokens_seen': 28762397, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8252', 'grad_norm': '1.479', 'learning_rate': '4.967e-05', 'epoch': '0.3538', 'num_input_tokens_seen': 28764444, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4034', 'grad_norm': '1.078', 'learning_rate': '4.967e-05', 'epoch': '0.3538', 'num_input_tokens_seen': 28766491, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.099', 'grad_norm': '1.935', 'learning_rate': '4.967e-05', 'epoch': '0.3539', 'num_input_tokens_seen': 28768538, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.345', 'grad_norm': '1.034', 'learning_rate': '4.967e-05', 'epoch': '0.3539', 'num_input_tokens_seen': 28770585, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.643', 'grad_norm': '1.412', 'learning_rate': '4.967e-05', 'epoch': '0.3539', 'num_input_tokens_seen': 28772632, 'train_runtime': '1.456e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.228', 'grad_norm': '2.407', 'learning_rate': '4.967e-05', 'epoch': '0.3539', 'num_input_tokens_seen': 28774679, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5522', 'grad_norm': '1.111', 'learning_rate': '4.967e-05', 'epoch': '0.354', 'num_input_tokens_seen': 28776726, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.852', 'grad_norm': '1.649', 'learning_rate': '4.967e-05', 'epoch': '0.354', 'num_input_tokens_seen': 28778773, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2412', 'grad_norm': '1.029', 'learning_rate': '4.967e-05', 'epoch': '0.354', 'num_input_tokens_seen': 28780820, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3365', 'grad_norm': '0.8024', 'learning_rate': '4.967e-05', 'epoch': '0.354', 'num_input_tokens_seen': 28782867, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3056', 'grad_norm': '0.9528', 'learning_rate': '4.967e-05', 'epoch': '0.3541', 'num_input_tokens_seen': 28784914, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7709', 'grad_norm': '1.519', 'learning_rate': '4.967e-05', 'epoch': '0.3541', 'num_input_tokens_seen': 28786961, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.267', 'grad_norm': '2.307', 'learning_rate': '4.967e-05', 'epoch': '0.3541', 'num_input_tokens_seen': 28789008, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6781', 'grad_norm': '1.362', 'learning_rate': '4.967e-05', 'epoch': '0.3541', 'num_input_tokens_seen': 28791055, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3247', 'grad_norm': '0.9563', 'learning_rate': '4.967e-05', 'epoch': '0.3542', 'num_input_tokens_seen': 28793102, 'train_runtime': '1.457e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.293', 'grad_norm': '2.282', 'learning_rate': '4.967e-05', 'epoch': '0.3542', 'num_input_tokens_seen': 28795149, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6793', 'grad_norm': '1.366', 'learning_rate': '4.967e-05', 'epoch': '0.3542', 'num_input_tokens_seen': 28797196, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9935', 'grad_norm': '2.087', 'learning_rate': '4.967e-05', 'epoch': '0.3542', 'num_input_tokens_seen': 28799243, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5713', 'grad_norm': '0.9848', 'learning_rate': '4.967e-05', 'epoch': '0.3543', 'num_input_tokens_seen': 28801290, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4851', 'grad_norm': '0.9825', 'learning_rate': '4.967e-05', 'epoch': '0.3543', 'num_input_tokens_seen': 28803337, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5469', 'grad_norm': '1.444', 'learning_rate': '4.967e-05', 'epoch': '0.3543', 'num_input_tokens_seen': 28805384, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3181', 'grad_norm': '0.8404', 'learning_rate': '4.967e-05', 'epoch': '0.3543', 'num_input_tokens_seen': 28807431, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4457', 'grad_norm': '1.247', 'learning_rate': '4.967e-05', 'epoch': '0.3544', 'num_input_tokens_seen': 28809478, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9093', 'grad_norm': '1.57', 'learning_rate': '4.967e-05', 'epoch': '0.3544', 'num_input_tokens_seen': 28811525, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8573', 'grad_norm': '1.232', 'learning_rate': '4.967e-05', 'epoch': '0.3544', 'num_input_tokens_seen': 28813572, 'train_runtime': '1.458e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6411', 'grad_norm': '1.796', 'learning_rate': '4.967e-05', 'epoch': '0.3544', 'num_input_tokens_seen': 28815619, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.175', 'grad_norm': '2.732', 'learning_rate': '4.967e-05', 'epoch': '0.3545', 'num_input_tokens_seen': 28817666, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5634', 'grad_norm': '1.202', 'learning_rate': '4.967e-05', 'epoch': '0.3545', 'num_input_tokens_seen': 28819713, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7697', 'grad_norm': '1.683', 'learning_rate': '4.967e-05', 'epoch': '0.3545', 'num_input_tokens_seen': 28821760, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7406', 'grad_norm': '1.558', 'learning_rate': '4.967e-05', 'epoch': '0.3545', 'num_input_tokens_seen': 28823807, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1947', 'grad_norm': '0.82', 'learning_rate': '4.967e-05', 'epoch': '0.3546', 'num_input_tokens_seen': 28825854, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.58', 'grad_norm': '2.734', 'learning_rate': '4.967e-05', 'epoch': '0.3546', 'num_input_tokens_seen': 28827901, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3787', 'grad_norm': '0.936', 'learning_rate': '4.967e-05', 'epoch': '0.3546', 'num_input_tokens_seen': 28829948, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.374', 'grad_norm': '2.328', 'learning_rate': '4.967e-05', 'epoch': '0.3546', 'num_input_tokens_seen': 28831995, 'train_runtime': '1.459e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3405', 'grad_norm': '1.31', 'learning_rate': '4.967e-05', 'epoch': '0.3547', 'num_input_tokens_seen': 28834042, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.222', 'grad_norm': '2.113', 'learning_rate': '4.967e-05', 'epoch': '0.3547', 'num_input_tokens_seen': 28836089, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2152', 'grad_norm': '0.9168', 'learning_rate': '4.967e-05', 'epoch': '0.3547', 'num_input_tokens_seen': 28838136, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3641', 'grad_norm': '0.9313', 'learning_rate': '4.967e-05', 'epoch': '0.3547', 'num_input_tokens_seen': 28840183, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7537', 'grad_norm': '1.235', 'learning_rate': '4.967e-05', 'epoch': '0.3548', 'num_input_tokens_seen': 28842230, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4939', 'grad_norm': '1.058', 'learning_rate': '4.967e-05', 'epoch': '0.3548', 'num_input_tokens_seen': 28844277, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1974', 'grad_norm': '0.9013', 'learning_rate': '4.967e-05', 'epoch': '0.3548', 'num_input_tokens_seen': 28846324, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8618', 'grad_norm': '1.294', 'learning_rate': '4.967e-05', 'epoch': '0.3548', 'num_input_tokens_seen': 28848371, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.139', 'grad_norm': '2.33', 'learning_rate': '4.967e-05', 'epoch': '0.3549', 'num_input_tokens_seen': 28850418, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3554', 'grad_norm': '0.9975', 'learning_rate': '4.967e-05', 'epoch': '0.3549', 'num_input_tokens_seen': 28852465, 'train_runtime': '1.46e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1784', 'grad_norm': '0.7876', 'learning_rate': '4.967e-05', 'epoch': '0.3549', 'num_input_tokens_seen': 28854512, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4741', 'grad_norm': '1.353', 'learning_rate': '4.967e-05', 'epoch': '0.3549', 'num_input_tokens_seen': 28856559, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5266', 'grad_norm': '1.152', 'learning_rate': '4.967e-05', 'epoch': '0.355', 'num_input_tokens_seen': 28858606, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9099', 'grad_norm': '1.89', 'learning_rate': '4.967e-05', 'epoch': '0.355', 'num_input_tokens_seen': 28860653, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3113', 'grad_norm': '0.8723', 'learning_rate': '4.967e-05', 'epoch': '0.355', 'num_input_tokens_seen': 28862700, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2497', 'grad_norm': '0.7445', 'learning_rate': '4.967e-05', 'epoch': '0.355', 'num_input_tokens_seen': 28864747, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9072', 'grad_norm': '1.975', 'learning_rate': '4.967e-05', 'epoch': '0.3551', 'num_input_tokens_seen': 28866794, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7976', 'grad_norm': '1.422', 'learning_rate': '4.967e-05', 'epoch': '0.3551', 'num_input_tokens_seen': 28868841, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5963', 'grad_norm': '1.09', 'learning_rate': '4.967e-05', 'epoch': '0.3551', 'num_input_tokens_seen': 28870888, 'train_runtime': '1.461e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7151', 'grad_norm': '1.715', 'learning_rate': '4.967e-05', 'epoch': '0.3551', 'num_input_tokens_seen': 28872935, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6509', 'grad_norm': '1.154', 'learning_rate': '4.967e-05', 'epoch': '0.3552', 'num_input_tokens_seen': 28874982, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.123', 'grad_norm': '1.418', 'learning_rate': '4.967e-05', 'epoch': '0.3552', 'num_input_tokens_seen': 28877029, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.321', 'grad_norm': '0.889', 'learning_rate': '4.967e-05', 'epoch': '0.3552', 'num_input_tokens_seen': 28879076, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3227', 'grad_norm': '0.8497', 'learning_rate': '4.967e-05', 'epoch': '0.3552', 'num_input_tokens_seen': 28881123, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5792', 'grad_norm': '1.164', 'learning_rate': '4.967e-05', 'epoch': '0.3553', 'num_input_tokens_seen': 28883170, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4507', 'grad_norm': '0.9636', 'learning_rate': '4.967e-05', 'epoch': '0.3553', 'num_input_tokens_seen': 28885217, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3692', 'grad_norm': '0.8711', 'learning_rate': '4.967e-05', 'epoch': '0.3553', 'num_input_tokens_seen': 28887264, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4765', 'grad_norm': '1.109', 'learning_rate': '4.967e-05', 'epoch': '0.3553', 'num_input_tokens_seen': 28889311, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.683', 'grad_norm': '2.578', 'learning_rate': '4.967e-05', 'epoch': '0.3554', 'num_input_tokens_seen': 28891358, 'train_runtime': '1.462e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.556', 'grad_norm': '2.494', 'learning_rate': '4.967e-05', 'epoch': '0.3554', 'num_input_tokens_seen': 28893405, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3489', 'grad_norm': '0.9433', 'learning_rate': '4.967e-05', 'epoch': '0.3554', 'num_input_tokens_seen': 28895452, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6257', 'grad_norm': '1.142', 'learning_rate': '4.967e-05', 'epoch': '0.3554', 'num_input_tokens_seen': 28897499, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7363', 'grad_norm': '1.647', 'learning_rate': '4.967e-05', 'epoch': '0.3555', 'num_input_tokens_seen': 28899546, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3866', 'grad_norm': '1.131', 'learning_rate': '4.967e-05', 'epoch': '0.3555', 'num_input_tokens_seen': 28901593, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5149', 'grad_norm': '1.215', 'learning_rate': '4.967e-05', 'epoch': '0.3555', 'num_input_tokens_seen': 28903640, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4377', 'grad_norm': '0.9063', 'learning_rate': '4.967e-05', 'epoch': '0.3555', 'num_input_tokens_seen': 28905687, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6098', 'grad_norm': '1.123', 'learning_rate': '4.967e-05', 'epoch': '0.3556', 'num_input_tokens_seen': 28907734, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7496', 'grad_norm': '1.388', 'learning_rate': '4.967e-05', 'epoch': '0.3556', 'num_input_tokens_seen': 28909781, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.341', 'grad_norm': '0.9183', 'learning_rate': '4.967e-05', 'epoch': '0.3556', 'num_input_tokens_seen': 28911828, 'train_runtime': '1.463e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3474', 'grad_norm': '0.8236', 'learning_rate': '4.967e-05', 'epoch': '0.3556', 'num_input_tokens_seen': 28913875, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3816', 'grad_norm': '0.9936', 'learning_rate': '4.966e-05', 'epoch': '0.3557', 'num_input_tokens_seen': 28915922, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.798', 'grad_norm': '2.663', 'learning_rate': '4.966e-05', 'epoch': '0.3557', 'num_input_tokens_seen': 28917969, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.831', 'grad_norm': '2.723', 'learning_rate': '4.966e-05', 'epoch': '0.3557', 'num_input_tokens_seen': 28920016, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2708', 'grad_norm': '0.8064', 'learning_rate': '4.966e-05', 'epoch': '0.3557', 'num_input_tokens_seen': 28922063, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1895', 'grad_norm': '0.7775', 'learning_rate': '4.966e-05', 'epoch': '0.3558', 'num_input_tokens_seen': 28924110, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3017', 'grad_norm': '0.7386', 'learning_rate': '4.966e-05', 'epoch': '0.3558', 'num_input_tokens_seen': 28926157, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6968', 'grad_norm': '1.089', 'learning_rate': '4.966e-05', 'epoch': '0.3558', 'num_input_tokens_seen': 28928204, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3443', 'grad_norm': '0.9318', 'learning_rate': '4.966e-05', 'epoch': '0.3558', 'num_input_tokens_seen': 28930251, 'train_runtime': '1.464e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3888', 'grad_norm': '0.9146', 'learning_rate': '4.966e-05', 'epoch': '0.3559', 'num_input_tokens_seen': 28932298, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8978', 'grad_norm': '1.661', 'learning_rate': '4.966e-05', 'epoch': '0.3559', 'num_input_tokens_seen': 28934345, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.184', 'grad_norm': '0.7669', 'learning_rate': '4.966e-05', 'epoch': '0.3559', 'num_input_tokens_seen': 28936392, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1694', 'grad_norm': '0.6762', 'learning_rate': '4.966e-05', 'epoch': '0.3559', 'num_input_tokens_seen': 28938439, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6931', 'grad_norm': '1.191', 'learning_rate': '4.966e-05', 'epoch': '0.356', 'num_input_tokens_seen': 28940486, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.18', 'grad_norm': '2.148', 'learning_rate': '4.966e-05', 'epoch': '0.356', 'num_input_tokens_seen': 28942533, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2607', 'grad_norm': '0.7927', 'learning_rate': '4.966e-05', 'epoch': '0.356', 'num_input_tokens_seen': 28944580, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8503', 'grad_norm': '1.419', 'learning_rate': '4.966e-05', 'epoch': '0.356', 'num_input_tokens_seen': 28946627, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.802', 'grad_norm': '1.382', 'learning_rate': '4.966e-05', 'epoch': '0.3561', 'num_input_tokens_seen': 28948674, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.448', 'grad_norm': '0.9507', 'learning_rate': '4.966e-05', 'epoch': '0.3561', 'num_input_tokens_seen': 28950721, 'train_runtime': '1.465e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.259', 'grad_norm': '2.352', 'learning_rate': '4.966e-05', 'epoch': '0.3561', 'num_input_tokens_seen': 28952768, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5077', 'grad_norm': '1.222', 'learning_rate': '4.966e-05', 'epoch': '0.3561', 'num_input_tokens_seen': 28954815, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.355', 'grad_norm': '3.293', 'learning_rate': '4.966e-05', 'epoch': '0.3562', 'num_input_tokens_seen': 28956862, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.499', 'grad_norm': '1.229', 'learning_rate': '4.966e-05', 'epoch': '0.3562', 'num_input_tokens_seen': 28958909, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3152', 'grad_norm': '0.9491', 'learning_rate': '4.966e-05', 'epoch': '0.3562', 'num_input_tokens_seen': 28960956, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7586', 'grad_norm': '1.375', 'learning_rate': '4.966e-05', 'epoch': '0.3562', 'num_input_tokens_seen': 28963003, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9376', 'grad_norm': '1.76', 'learning_rate': '4.966e-05', 'epoch': '0.3563', 'num_input_tokens_seen': 28965050, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5613', 'grad_norm': '1.358', 'learning_rate': '4.966e-05', 'epoch': '0.3563', 'num_input_tokens_seen': 28967097, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8208', 'grad_norm': '1.262', 'learning_rate': '4.966e-05', 'epoch': '0.3563', 'num_input_tokens_seen': 28969144, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.166', 'grad_norm': '2.405', 'learning_rate': '4.966e-05', 'epoch': '0.3563', 'num_input_tokens_seen': 28971191, 'train_runtime': '1.466e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6966', 'grad_norm': '1.236', 'learning_rate': '4.966e-05', 'epoch': '0.3564', 'num_input_tokens_seen': 28973238, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5462', 'grad_norm': '1.058', 'learning_rate': '4.966e-05', 'epoch': '0.3564', 'num_input_tokens_seen': 28975285, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2555', 'grad_norm': '0.8449', 'learning_rate': '4.966e-05', 'epoch': '0.3564', 'num_input_tokens_seen': 28977332, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '2.239', 'learning_rate': '4.966e-05', 'epoch': '0.3564', 'num_input_tokens_seen': 28979379, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4173', 'grad_norm': '0.9994', 'learning_rate': '4.966e-05', 'epoch': '0.3565', 'num_input_tokens_seen': 28981426, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.504', 'grad_norm': '2.222', 'learning_rate': '4.966e-05', 'epoch': '0.3565', 'num_input_tokens_seen': 28983473, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.28', 'grad_norm': '2.842', 'learning_rate': '4.966e-05', 'epoch': '0.3565', 'num_input_tokens_seen': 28985520, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5642', 'grad_norm': '1.569', 'learning_rate': '4.966e-05', 'epoch': '0.3565', 'num_input_tokens_seen': 28987567, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8867', 'grad_norm': '1.361', 'learning_rate': '4.966e-05', 'epoch': '0.3566', 'num_input_tokens_seen': 28989614, 'train_runtime': '1.467e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8468', 'grad_norm': '2.589', 'learning_rate': '4.966e-05', 'epoch': '0.3566', 'num_input_tokens_seen': 28991661, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3546', 'grad_norm': '1.067', 'learning_rate': '4.966e-05', 'epoch': '0.3566', 'num_input_tokens_seen': 28993708, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.346', 'grad_norm': '0.8278', 'learning_rate': '4.966e-05', 'epoch': '0.3566', 'num_input_tokens_seen': 28995755, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9991', 'grad_norm': '1.398', 'learning_rate': '4.966e-05', 'epoch': '0.3567', 'num_input_tokens_seen': 28997802, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4586', 'grad_norm': '1.079', 'learning_rate': '4.966e-05', 'epoch': '0.3567', 'num_input_tokens_seen': 28999849, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7287', 'grad_norm': '1.505', 'learning_rate': '4.966e-05', 'epoch': '0.3567', 'num_input_tokens_seen': 29001896, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5205', 'grad_norm': '1.075', 'learning_rate': '4.966e-05', 'epoch': '0.3567', 'num_input_tokens_seen': 29003943, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7731', 'grad_norm': '1.125', 'learning_rate': '4.966e-05', 'epoch': '0.3568', 'num_input_tokens_seen': 29005990, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.089', 'grad_norm': '2.017', 'learning_rate': '4.966e-05', 'epoch': '0.3568', 'num_input_tokens_seen': 29008037, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5823', 'grad_norm': '0.8942', 'learning_rate': '4.966e-05', 'epoch': '0.3568', 'num_input_tokens_seen': 29010084, 'train_runtime': '1.468e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4129', 'grad_norm': '0.9027', 'learning_rate': '4.966e-05', 'epoch': '0.3568', 'num_input_tokens_seen': 29012131, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3535', 'grad_norm': '1.031', 'learning_rate': '4.966e-05', 'epoch': '0.3569', 'num_input_tokens_seen': 29014178, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8126', 'grad_norm': '1.322', 'learning_rate': '4.966e-05', 'epoch': '0.3569', 'num_input_tokens_seen': 29016225, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6527', 'grad_norm': '1.057', 'learning_rate': '4.966e-05', 'epoch': '0.3569', 'num_input_tokens_seen': 29018272, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3534', 'grad_norm': '0.973', 'learning_rate': '4.966e-05', 'epoch': '0.357', 'num_input_tokens_seen': 29020319, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6065', 'grad_norm': '1.4', 'learning_rate': '4.966e-05', 'epoch': '0.357', 'num_input_tokens_seen': 29022366, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4549', 'grad_norm': '1.03', 'learning_rate': '4.966e-05', 'epoch': '0.357', 'num_input_tokens_seen': 29024413, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2945', 'grad_norm': '0.7912', 'learning_rate': '4.966e-05', 'epoch': '0.357', 'num_input_tokens_seen': 29026460, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6438', 'grad_norm': '1.231', 'learning_rate': '4.966e-05', 'epoch': '0.3571', 'num_input_tokens_seen': 29028507, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6539', 'grad_norm': '1.106', 'learning_rate': '4.966e-05', 'epoch': '0.3571', 'num_input_tokens_seen': 29030554, 'train_runtime': '1.469e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.732', 'grad_norm': '1.26', 'learning_rate': '4.966e-05', 'epoch': '0.3571', 'num_input_tokens_seen': 29032601, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4788', 'grad_norm': '1.594', 'learning_rate': '4.966e-05', 'epoch': '0.3571', 'num_input_tokens_seen': 29034648, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6232', 'grad_norm': '1.371', 'learning_rate': '4.966e-05', 'epoch': '0.3572', 'num_input_tokens_seen': 29036695, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.485', 'grad_norm': '2.314', 'learning_rate': '4.966e-05', 'epoch': '0.3572', 'num_input_tokens_seen': 29038742, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2305', 'grad_norm': '0.9167', 'learning_rate': '4.966e-05', 'epoch': '0.3572', 'num_input_tokens_seen': 29040789, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7379', 'grad_norm': '1.639', 'learning_rate': '4.966e-05', 'epoch': '0.3572', 'num_input_tokens_seen': 29042836, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3531', 'grad_norm': '0.7886', 'learning_rate': '4.966e-05', 'epoch': '0.3573', 'num_input_tokens_seen': 29044883, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.492', 'grad_norm': '3.114', 'learning_rate': '4.966e-05', 'epoch': '0.3573', 'num_input_tokens_seen': 29046930, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4055', 'grad_norm': '1.148', 'learning_rate': '4.966e-05', 'epoch': '0.3573', 'num_input_tokens_seen': 29048977, 'train_runtime': '1.47e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.382', 'grad_norm': '2.224', 'learning_rate': '4.966e-05', 'epoch': '0.3573', 'num_input_tokens_seen': 29051024, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3257', 'grad_norm': '0.8589', 'learning_rate': '4.966e-05', 'epoch': '0.3574', 'num_input_tokens_seen': 29053071, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4768', 'grad_norm': '1.021', 'learning_rate': '4.966e-05', 'epoch': '0.3574', 'num_input_tokens_seen': 29055118, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2745', 'grad_norm': '0.9687', 'learning_rate': '4.966e-05', 'epoch': '0.3574', 'num_input_tokens_seen': 29057165, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8163', 'grad_norm': '1.37', 'learning_rate': '4.966e-05', 'epoch': '0.3574', 'num_input_tokens_seen': 29059212, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7226', 'grad_norm': '1.745', 'learning_rate': '4.966e-05', 'epoch': '0.3575', 'num_input_tokens_seen': 29061259, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.775', 'grad_norm': '1.461', 'learning_rate': '4.966e-05', 'epoch': '0.3575', 'num_input_tokens_seen': 29063306, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.89', 'grad_norm': '3.038', 'learning_rate': '4.966e-05', 'epoch': '0.3575', 'num_input_tokens_seen': 29065353, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.642', 'grad_norm': '1.086', 'learning_rate': '4.966e-05', 'epoch': '0.3575', 'num_input_tokens_seen': 29067400, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9296', 'grad_norm': '1.381', 'learning_rate': '4.966e-05', 'epoch': '0.3576', 'num_input_tokens_seen': 29069447, 'train_runtime': '1.471e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4402', 'grad_norm': '0.895', 'learning_rate': '4.966e-05', 'epoch': '0.3576', 'num_input_tokens_seen': 29071494, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2242', 'grad_norm': '0.8107', 'learning_rate': '4.966e-05', 'epoch': '0.3576', 'num_input_tokens_seen': 29073541, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8383', 'grad_norm': '1.717', 'learning_rate': '4.966e-05', 'epoch': '0.3576', 'num_input_tokens_seen': 29075588, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.675', 'grad_norm': '1.092', 'learning_rate': '4.966e-05', 'epoch': '0.3577', 'num_input_tokens_seen': 29077635, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4944', 'grad_norm': '1.163', 'learning_rate': '4.966e-05', 'epoch': '0.3577', 'num_input_tokens_seen': 29079682, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3523', 'grad_norm': '0.8416', 'learning_rate': '4.966e-05', 'epoch': '0.3577', 'num_input_tokens_seen': 29081729, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2785', 'grad_norm': '0.7434', 'learning_rate': '4.966e-05', 'epoch': '0.3577', 'num_input_tokens_seen': 29083776, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '1.435', 'learning_rate': '4.966e-05', 'epoch': '0.3578', 'num_input_tokens_seen': 29085823, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5853', 'grad_norm': '1.362', 'learning_rate': '4.966e-05', 'epoch': '0.3578', 'num_input_tokens_seen': 29087870, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2857', 'grad_norm': '0.8803', 'learning_rate': '4.966e-05', 'epoch': '0.3578', 'num_input_tokens_seen': 29089917, 'train_runtime': '1.472e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.336', 'grad_norm': '2.211', 'learning_rate': '4.966e-05', 'epoch': '0.3578', 'num_input_tokens_seen': 29091964, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.152', 'grad_norm': '1.826', 'learning_rate': '4.966e-05', 'epoch': '0.3579', 'num_input_tokens_seen': 29094011, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.461', 'grad_norm': '1.716', 'learning_rate': '4.966e-05', 'epoch': '0.3579', 'num_input_tokens_seen': 29096058, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.54', 'grad_norm': '2.591', 'learning_rate': '4.966e-05', 'epoch': '0.3579', 'num_input_tokens_seen': 29098105, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.748', 'grad_norm': '2.419', 'learning_rate': '4.966e-05', 'epoch': '0.3579', 'num_input_tokens_seen': 29100152, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3169', 'grad_norm': '1.049', 'learning_rate': '4.966e-05', 'epoch': '0.358', 'num_input_tokens_seen': 29102199, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2317', 'grad_norm': '0.7126', 'learning_rate': '4.966e-05', 'epoch': '0.358', 'num_input_tokens_seen': 29104246, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3949', 'grad_norm': '1.186', 'learning_rate': '4.966e-05', 'epoch': '0.358', 'num_input_tokens_seen': 29106293, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5905', 'grad_norm': '1.243', 'learning_rate': '4.966e-05', 'epoch': '0.358', 'num_input_tokens_seen': 29108340, 'train_runtime': '1.473e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7015', 'grad_norm': '1.337', 'learning_rate': '4.966e-05', 'epoch': '0.3581', 'num_input_tokens_seen': 29110387, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.653', 'grad_norm': '1.347', 'learning_rate': '4.966e-05', 'epoch': '0.3581', 'num_input_tokens_seen': 29112434, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6841', 'grad_norm': '1.309', 'learning_rate': '4.966e-05', 'epoch': '0.3581', 'num_input_tokens_seen': 29114481, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1928', 'grad_norm': '0.9036', 'learning_rate': '4.966e-05', 'epoch': '0.3581', 'num_input_tokens_seen': 29116528, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3616', 'grad_norm': '0.8703', 'learning_rate': '4.966e-05', 'epoch': '0.3582', 'num_input_tokens_seen': 29118575, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.314', 'grad_norm': '0.9246', 'learning_rate': '4.966e-05', 'epoch': '0.3582', 'num_input_tokens_seen': 29120622, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8423', 'grad_norm': '1.154', 'learning_rate': '4.966e-05', 'epoch': '0.3582', 'num_input_tokens_seen': 29122669, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.775', 'grad_norm': '1.317', 'learning_rate': '4.966e-05', 'epoch': '0.3582', 'num_input_tokens_seen': 29124716, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1718', 'grad_norm': '0.8812', 'learning_rate': '4.966e-05', 'epoch': '0.3583', 'num_input_tokens_seen': 29126763, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8148', 'grad_norm': '1.749', 'learning_rate': '4.966e-05', 'epoch': '0.3583', 'num_input_tokens_seen': 29128810, 'train_runtime': '1.474e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9325', 'grad_norm': '1.593', 'learning_rate': '4.966e-05', 'epoch': '0.3583', 'num_input_tokens_seen': 29130857, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5255', 'grad_norm': '1.175', 'learning_rate': '4.966e-05', 'epoch': '0.3583', 'num_input_tokens_seen': 29132904, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.847', 'grad_norm': '2.185', 'learning_rate': '4.966e-05', 'epoch': '0.3584', 'num_input_tokens_seen': 29134951, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2458', 'grad_norm': '0.9002', 'learning_rate': '4.966e-05', 'epoch': '0.3584', 'num_input_tokens_seen': 29136998, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2605', 'grad_norm': '0.9827', 'learning_rate': '4.966e-05', 'epoch': '0.3584', 'num_input_tokens_seen': 29139045, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2668', 'grad_norm': '1.004', 'learning_rate': '4.966e-05', 'epoch': '0.3584', 'num_input_tokens_seen': 29141092, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6955', 'grad_norm': '1.358', 'learning_rate': '4.966e-05', 'epoch': '0.3585', 'num_input_tokens_seen': 29143139, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2594', 'grad_norm': '0.8689', 'learning_rate': '4.966e-05', 'epoch': '0.3585', 'num_input_tokens_seen': 29145186, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5277', 'grad_norm': '1.052', 'learning_rate': '4.966e-05', 'epoch': '0.3585', 'num_input_tokens_seen': 29147233, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6813', 'grad_norm': '0.879', 'learning_rate': '4.966e-05', 'epoch': '0.3585', 'num_input_tokens_seen': 29149280, 'train_runtime': '1.475e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.564', 'grad_norm': '1.855', 'learning_rate': '4.966e-05', 'epoch': '0.3586', 'num_input_tokens_seen': 29151327, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.914', 'grad_norm': '3.543', 'learning_rate': '4.966e-05', 'epoch': '0.3586', 'num_input_tokens_seen': 29153374, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5988', 'grad_norm': '1.442', 'learning_rate': '4.966e-05', 'epoch': '0.3586', 'num_input_tokens_seen': 29155421, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4623', 'grad_norm': '0.9882', 'learning_rate': '4.966e-05', 'epoch': '0.3586', 'num_input_tokens_seen': 29157468, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4664', 'grad_norm': '1.092', 'learning_rate': '4.966e-05', 'epoch': '0.3587', 'num_input_tokens_seen': 29159515, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2793', 'grad_norm': '0.8555', 'learning_rate': '4.966e-05', 'epoch': '0.3587', 'num_input_tokens_seen': 29161562, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6194', 'grad_norm': '1.395', 'learning_rate': '4.966e-05', 'epoch': '0.3587', 'num_input_tokens_seen': 29163609, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.192', 'grad_norm': '0.7617', 'learning_rate': '4.966e-05', 'epoch': '0.3587', 'num_input_tokens_seen': 29165656, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4532', 'grad_norm': '1.034', 'learning_rate': '4.966e-05', 'epoch': '0.3588', 'num_input_tokens_seen': 29167703, 'train_runtime': '1.476e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.307', 'grad_norm': '2.683', 'learning_rate': '4.966e-05', 'epoch': '0.3588', 'num_input_tokens_seen': 29169750, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5467', 'grad_norm': '1.173', 'learning_rate': '4.966e-05', 'epoch': '0.3588', 'num_input_tokens_seen': 29171797, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3543', 'grad_norm': '1.242', 'learning_rate': '4.966e-05', 'epoch': '0.3588', 'num_input_tokens_seen': 29173844, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8277', 'grad_norm': '1.418', 'learning_rate': '4.966e-05', 'epoch': '0.3589', 'num_input_tokens_seen': 29175891, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7191', 'grad_norm': '1.318', 'learning_rate': '4.966e-05', 'epoch': '0.3589', 'num_input_tokens_seen': 29177938, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.342', 'grad_norm': '0.7078', 'learning_rate': '4.966e-05', 'epoch': '0.3589', 'num_input_tokens_seen': 29179985, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7163', 'grad_norm': '1.325', 'learning_rate': '4.966e-05', 'epoch': '0.3589', 'num_input_tokens_seen': 29182032, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6895', 'grad_norm': '0.9456', 'learning_rate': '4.966e-05', 'epoch': '0.359', 'num_input_tokens_seen': 29184079, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5951', 'grad_norm': '1.031', 'learning_rate': '4.966e-05', 'epoch': '0.359', 'num_input_tokens_seen': 29186126, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.338', 'grad_norm': '1.009', 'learning_rate': '4.966e-05', 'epoch': '0.359', 'num_input_tokens_seen': 29188173, 'train_runtime': '1.477e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.166', 'grad_norm': '2.097', 'learning_rate': '4.966e-05', 'epoch': '0.359', 'num_input_tokens_seen': 29190220, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8324', 'grad_norm': '1.275', 'learning_rate': '4.966e-05', 'epoch': '0.3591', 'num_input_tokens_seen': 29192267, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3111', 'grad_norm': '0.8569', 'learning_rate': '4.966e-05', 'epoch': '0.3591', 'num_input_tokens_seen': 29194314, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9828', 'grad_norm': '1.362', 'learning_rate': '4.966e-05', 'epoch': '0.3591', 'num_input_tokens_seen': 29196361, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4805', 'grad_norm': '1.132', 'learning_rate': '4.966e-05', 'epoch': '0.3591', 'num_input_tokens_seen': 29198408, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2309', 'grad_norm': '0.732', 'learning_rate': '4.966e-05', 'epoch': '0.3592', 'num_input_tokens_seen': 29200455, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5741', 'grad_norm': '1.32', 'learning_rate': '4.966e-05', 'epoch': '0.3592', 'num_input_tokens_seen': 29202502, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8993', 'grad_norm': '1.392', 'learning_rate': '4.966e-05', 'epoch': '0.3592', 'num_input_tokens_seen': 29204549, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5101', 'grad_norm': '1.078', 'learning_rate': '4.966e-05', 'epoch': '0.3592', 'num_input_tokens_seen': 29206596, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.486', 'grad_norm': '2.921', 'learning_rate': '4.966e-05', 'epoch': '0.3593', 'num_input_tokens_seen': 29208643, 'train_runtime': '1.478e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7454', 'grad_norm': '1.418', 'learning_rate': '4.966e-05', 'epoch': '0.3593', 'num_input_tokens_seen': 29210690, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3117', 'grad_norm': '1.072', 'learning_rate': '4.966e-05', 'epoch': '0.3593', 'num_input_tokens_seen': 29212737, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6472', 'grad_norm': '1.846', 'learning_rate': '4.966e-05', 'epoch': '0.3593', 'num_input_tokens_seen': 29214784, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2546', 'grad_norm': '0.8033', 'learning_rate': '4.966e-05', 'epoch': '0.3594', 'num_input_tokens_seen': 29216831, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3602', 'grad_norm': '0.8032', 'learning_rate': '4.966e-05', 'epoch': '0.3594', 'num_input_tokens_seen': 29218878, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6776', 'grad_norm': '1.135', 'learning_rate': '4.966e-05', 'epoch': '0.3594', 'num_input_tokens_seen': 29220925, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4313', 'grad_norm': '1.174', 'learning_rate': '4.966e-05', 'epoch': '0.3594', 'num_input_tokens_seen': 29222972, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5497', 'grad_norm': '1.568', 'learning_rate': '4.965e-05', 'epoch': '0.3595', 'num_input_tokens_seen': 29225019, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4438', 'grad_norm': '0.9302', 'learning_rate': '4.965e-05', 'epoch': '0.3595', 'num_input_tokens_seen': 29227066, 'train_runtime': '1.479e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.681', 'grad_norm': '2.723', 'learning_rate': '4.965e-05', 'epoch': '0.3595', 'num_input_tokens_seen': 29229113, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.317', 'grad_norm': '2.01', 'learning_rate': '4.965e-05', 'epoch': '0.3595', 'num_input_tokens_seen': 29231160, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3369', 'grad_norm': '0.8791', 'learning_rate': '4.965e-05', 'epoch': '0.3596', 'num_input_tokens_seen': 29233207, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9747', 'grad_norm': '1.684', 'learning_rate': '4.965e-05', 'epoch': '0.3596', 'num_input_tokens_seen': 29235254, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6473', 'grad_norm': '1.161', 'learning_rate': '4.965e-05', 'epoch': '0.3596', 'num_input_tokens_seen': 29237301, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2928', 'grad_norm': '0.7584', 'learning_rate': '4.965e-05', 'epoch': '0.3596', 'num_input_tokens_seen': 29239348, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.886', 'grad_norm': '1.519', 'learning_rate': '4.965e-05', 'epoch': '0.3597', 'num_input_tokens_seen': 29241395, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.158', 'grad_norm': '1.711', 'learning_rate': '4.965e-05', 'epoch': '0.3597', 'num_input_tokens_seen': 29243442, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8088', 'grad_norm': '1.469', 'learning_rate': '4.965e-05', 'epoch': '0.3597', 'num_input_tokens_seen': 29245489, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7666', 'grad_norm': '1.373', 'learning_rate': '4.965e-05', 'epoch': '0.3597', 'num_input_tokens_seen': 29247536, 'train_runtime': '1.48e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.899', 'grad_norm': '2.452', 'learning_rate': '4.965e-05', 'epoch': '0.3598', 'num_input_tokens_seen': 29249583, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3658', 'grad_norm': '1.152', 'learning_rate': '4.965e-05', 'epoch': '0.3598', 'num_input_tokens_seen': 29251630, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5022', 'grad_norm': '1.341', 'learning_rate': '4.965e-05', 'epoch': '0.3598', 'num_input_tokens_seen': 29253677, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.132', 'grad_norm': '2.171', 'learning_rate': '4.965e-05', 'epoch': '0.3598', 'num_input_tokens_seen': 29255724, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6982', 'grad_norm': '1.31', 'learning_rate': '4.965e-05', 'epoch': '0.3599', 'num_input_tokens_seen': 29257771, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3121', 'grad_norm': '1.139', 'learning_rate': '4.965e-05', 'epoch': '0.3599', 'num_input_tokens_seen': 29259818, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6424', 'grad_norm': '1.308', 'learning_rate': '4.965e-05', 'epoch': '0.3599', 'num_input_tokens_seen': 29261865, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3174', 'grad_norm': '0.906', 'learning_rate': '4.965e-05', 'epoch': '0.3599', 'num_input_tokens_seen': 29263912, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3939', 'grad_norm': '1.076', 'learning_rate': '4.965e-05', 'epoch': '0.36', 'num_input_tokens_seen': 29265959, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2593', 'grad_norm': '1.017', 'learning_rate': '4.965e-05', 'epoch': '0.36', 'num_input_tokens_seen': 29268006, 'train_runtime': '1.481e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3339', 'grad_norm': '0.9228', 'learning_rate': '4.965e-05', 'epoch': '0.36', 'num_input_tokens_seen': 29270053, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3152', 'grad_norm': '0.8656', 'learning_rate': '4.965e-05', 'epoch': '0.36', 'num_input_tokens_seen': 29272100, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5233', 'grad_norm': '1.228', 'learning_rate': '4.965e-05', 'epoch': '0.3601', 'num_input_tokens_seen': 29274147, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7275', 'grad_norm': '1.059', 'learning_rate': '4.965e-05', 'epoch': '0.3601', 'num_input_tokens_seen': 29276194, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2715', 'grad_norm': '0.9196', 'learning_rate': '4.965e-05', 'epoch': '0.3601', 'num_input_tokens_seen': 29278241, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2671', 'grad_norm': '0.7574', 'learning_rate': '4.965e-05', 'epoch': '0.3601', 'num_input_tokens_seen': 29280288, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.832', 'grad_norm': '1.641', 'learning_rate': '4.965e-05', 'epoch': '0.3602', 'num_input_tokens_seen': 29282335, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.726', 'grad_norm': '2.728', 'learning_rate': '4.965e-05', 'epoch': '0.3602', 'num_input_tokens_seen': 29284382, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3125', 'grad_norm': '0.9128', 'learning_rate': '4.965e-05', 'epoch': '0.3602', 'num_input_tokens_seen': 29286429, 'train_runtime': '1.482e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.369', 'grad_norm': '3.339', 'learning_rate': '4.965e-05', 'epoch': '0.3602', 'num_input_tokens_seen': 29288476, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2416', 'grad_norm': '0.7898', 'learning_rate': '4.965e-05', 'epoch': '0.3603', 'num_input_tokens_seen': 29290523, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3874', 'grad_norm': '1.028', 'learning_rate': '4.965e-05', 'epoch': '0.3603', 'num_input_tokens_seen': 29292570, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4555', 'grad_norm': '1.34', 'learning_rate': '4.965e-05', 'epoch': '0.3603', 'num_input_tokens_seen': 29294617, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2555', 'grad_norm': '0.9253', 'learning_rate': '4.965e-05', 'epoch': '0.3603', 'num_input_tokens_seen': 29296664, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5362', 'grad_norm': '1.83', 'learning_rate': '4.965e-05', 'epoch': '0.3604', 'num_input_tokens_seen': 29298711, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.919', 'grad_norm': '1.586', 'learning_rate': '4.965e-05', 'epoch': '0.3604', 'num_input_tokens_seen': 29300758, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6436', 'grad_norm': '1.24', 'learning_rate': '4.965e-05', 'epoch': '0.3604', 'num_input_tokens_seen': 29302805, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4238', 'grad_norm': '0.988', 'learning_rate': '4.965e-05', 'epoch': '0.3605', 'num_input_tokens_seen': 29304852, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6421', 'grad_norm': '1.463', 'learning_rate': '4.965e-05', 'epoch': '0.3605', 'num_input_tokens_seen': 29306899, 'train_runtime': '1.483e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3025', 'grad_norm': '0.9177', 'learning_rate': '4.965e-05', 'epoch': '0.3605', 'num_input_tokens_seen': 29308946, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8564', 'grad_norm': '1.86', 'learning_rate': '4.965e-05', 'epoch': '0.3605', 'num_input_tokens_seen': 29310993, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3237', 'grad_norm': '0.8177', 'learning_rate': '4.965e-05', 'epoch': '0.3606', 'num_input_tokens_seen': 29313040, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6023', 'grad_norm': '1.216', 'learning_rate': '4.965e-05', 'epoch': '0.3606', 'num_input_tokens_seen': 29315087, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2006', 'grad_norm': '0.7155', 'learning_rate': '4.965e-05', 'epoch': '0.3606', 'num_input_tokens_seen': 29317134, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.34', 'grad_norm': '0.9486', 'learning_rate': '4.965e-05', 'epoch': '0.3606', 'num_input_tokens_seen': 29319181, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3222', 'grad_norm': '0.9131', 'learning_rate': '4.965e-05', 'epoch': '0.3607', 'num_input_tokens_seen': 29321228, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5562', 'grad_norm': '1.085', 'learning_rate': '4.965e-05', 'epoch': '0.3607', 'num_input_tokens_seen': 29323275, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.06', 'grad_norm': '2.105', 'learning_rate': '4.965e-05', 'epoch': '0.3607', 'num_input_tokens_seen': 29325322, 'train_runtime': '1.484e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.785', 'grad_norm': '1.717', 'learning_rate': '4.965e-05', 'epoch': '0.3607', 'num_input_tokens_seen': 29327369, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3358', 'grad_norm': '0.843', 'learning_rate': '4.965e-05', 'epoch': '0.3608', 'num_input_tokens_seen': 29329416, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.56', 'grad_norm': '2.684', 'learning_rate': '4.965e-05', 'epoch': '0.3608', 'num_input_tokens_seen': 29331463, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2903', 'grad_norm': '0.8169', 'learning_rate': '4.965e-05', 'epoch': '0.3608', 'num_input_tokens_seen': 29333510, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.682', 'grad_norm': '3.024', 'learning_rate': '4.965e-05', 'epoch': '0.3608', 'num_input_tokens_seen': 29335557, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2658', 'grad_norm': '0.7581', 'learning_rate': '4.965e-05', 'epoch': '0.3609', 'num_input_tokens_seen': 29337604, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2154', 'grad_norm': '0.83', 'learning_rate': '4.965e-05', 'epoch': '0.3609', 'num_input_tokens_seen': 29339651, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1633', 'grad_norm': '0.7754', 'learning_rate': '4.965e-05', 'epoch': '0.3609', 'num_input_tokens_seen': 29341698, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3474', 'grad_norm': '0.8461', 'learning_rate': '4.965e-05', 'epoch': '0.3609', 'num_input_tokens_seen': 29343745, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5843', 'grad_norm': '1.275', 'learning_rate': '4.965e-05', 'epoch': '0.361', 'num_input_tokens_seen': 29345792, 'train_runtime': '1.485e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6266', 'grad_norm': '1.038', 'learning_rate': '4.965e-05', 'epoch': '0.361', 'num_input_tokens_seen': 29347839, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5071', 'grad_norm': '1.19', 'learning_rate': '4.965e-05', 'epoch': '0.361', 'num_input_tokens_seen': 29349886, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9735', 'grad_norm': '1.938', 'learning_rate': '4.965e-05', 'epoch': '0.361', 'num_input_tokens_seen': 29351933, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3563', 'grad_norm': '1.26', 'learning_rate': '4.965e-05', 'epoch': '0.3611', 'num_input_tokens_seen': 29353980, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2344', 'grad_norm': '0.8927', 'learning_rate': '4.965e-05', 'epoch': '0.3611', 'num_input_tokens_seen': 29356027, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.212', 'grad_norm': '0.9728', 'learning_rate': '4.965e-05', 'epoch': '0.3611', 'num_input_tokens_seen': 29358074, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.009', 'grad_norm': '1.858', 'learning_rate': '4.965e-05', 'epoch': '0.3611', 'num_input_tokens_seen': 29360121, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.263', 'grad_norm': '1.783', 'learning_rate': '4.965e-05', 'epoch': '0.3612', 'num_input_tokens_seen': 29362168, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.328', 'grad_norm': '0.9027', 'learning_rate': '4.965e-05', 'epoch': '0.3612', 'num_input_tokens_seen': 29364215, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.214', 'grad_norm': '2.174', 'learning_rate': '4.965e-05', 'epoch': '0.3612', 'num_input_tokens_seen': 29366262, 'train_runtime': '1.486e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2596', 'grad_norm': '0.8879', 'learning_rate': '4.965e-05', 'epoch': '0.3612', 'num_input_tokens_seen': 29368309, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.513', 'grad_norm': '1.053', 'learning_rate': '4.965e-05', 'epoch': '0.3613', 'num_input_tokens_seen': 29370356, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8643', 'grad_norm': '1.409', 'learning_rate': '4.965e-05', 'epoch': '0.3613', 'num_input_tokens_seen': 29372403, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.195', 'grad_norm': '1.831', 'learning_rate': '4.965e-05', 'epoch': '0.3613', 'num_input_tokens_seen': 29374450, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3278', 'grad_norm': '0.7036', 'learning_rate': '4.965e-05', 'epoch': '0.3613', 'num_input_tokens_seen': 29376497, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5786', 'grad_norm': '1.686', 'learning_rate': '4.965e-05', 'epoch': '0.3614', 'num_input_tokens_seen': 29378544, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2176', 'grad_norm': '0.9421', 'learning_rate': '4.965e-05', 'epoch': '0.3614', 'num_input_tokens_seen': 29380591, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.267', 'grad_norm': '1.873', 'learning_rate': '4.965e-05', 'epoch': '0.3614', 'num_input_tokens_seen': 29382638, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2109', 'grad_norm': '0.7229', 'learning_rate': '4.965e-05', 'epoch': '0.3614', 'num_input_tokens_seen': 29384685, 'train_runtime': '1.487e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2471', 'grad_norm': '0.8304', 'learning_rate': '4.965e-05', 'epoch': '0.3615', 'num_input_tokens_seen': 29386732, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.063', 'grad_norm': '2.554', 'learning_rate': '4.965e-05', 'epoch': '0.3615', 'num_input_tokens_seen': 29388779, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2659', 'grad_norm': '0.724', 'learning_rate': '4.965e-05', 'epoch': '0.3615', 'num_input_tokens_seen': 29390826, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.305', 'grad_norm': '2.661', 'learning_rate': '4.965e-05', 'epoch': '0.3615', 'num_input_tokens_seen': 29392873, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8203', 'grad_norm': '1.899', 'learning_rate': '4.965e-05', 'epoch': '0.3616', 'num_input_tokens_seen': 29394920, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4627', 'grad_norm': '1.223', 'learning_rate': '4.965e-05', 'epoch': '0.3616', 'num_input_tokens_seen': 29396967, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.033', 'grad_norm': '2.244', 'learning_rate': '4.965e-05', 'epoch': '0.3616', 'num_input_tokens_seen': 29399014, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2661', 'grad_norm': '0.846', 'learning_rate': '4.965e-05', 'epoch': '0.3616', 'num_input_tokens_seen': 29401061, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7193', 'grad_norm': '1.164', 'learning_rate': '4.965e-05', 'epoch': '0.3617', 'num_input_tokens_seen': 29403108, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1527', 'grad_norm': '0.8827', 'learning_rate': '4.965e-05', 'epoch': '0.3617', 'num_input_tokens_seen': 29405155, 'train_runtime': '1.488e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4788', 'grad_norm': '1.154', 'learning_rate': '4.965e-05', 'epoch': '0.3617', 'num_input_tokens_seen': 29407202, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2391', 'grad_norm': '0.8144', 'learning_rate': '4.965e-05', 'epoch': '0.3617', 'num_input_tokens_seen': 29409249, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6533', 'grad_norm': '1.216', 'learning_rate': '4.965e-05', 'epoch': '0.3618', 'num_input_tokens_seen': 29411296, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9225', 'grad_norm': '1.482', 'learning_rate': '4.965e-05', 'epoch': '0.3618', 'num_input_tokens_seen': 29413343, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2113', 'grad_norm': '0.8142', 'learning_rate': '4.965e-05', 'epoch': '0.3618', 'num_input_tokens_seen': 29415390, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6691', 'grad_norm': '1.174', 'learning_rate': '4.965e-05', 'epoch': '0.3618', 'num_input_tokens_seen': 29417437, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9506', 'grad_norm': '1.367', 'learning_rate': '4.965e-05', 'epoch': '0.3619', 'num_input_tokens_seen': 29419484, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2758', 'grad_norm': '1.205', 'learning_rate': '4.965e-05', 'epoch': '0.3619', 'num_input_tokens_seen': 29421531, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7598', 'grad_norm': '1.5', 'learning_rate': '4.965e-05', 'epoch': '0.3619', 'num_input_tokens_seen': 29423578, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2473', 'grad_norm': '0.8409', 'learning_rate': '4.965e-05', 'epoch': '0.3619', 'num_input_tokens_seen': 29425625, 'train_runtime': '1.489e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.227', 'grad_norm': '1.997', 'learning_rate': '4.965e-05', 'epoch': '0.362', 'num_input_tokens_seen': 29427672, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2098', 'grad_norm': '0.7667', 'learning_rate': '4.965e-05', 'epoch': '0.362', 'num_input_tokens_seen': 29429719, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1756', 'grad_norm': '0.7236', 'learning_rate': '4.965e-05', 'epoch': '0.362', 'num_input_tokens_seen': 29431766, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2699', 'grad_norm': '0.9288', 'learning_rate': '4.965e-05', 'epoch': '0.362', 'num_input_tokens_seen': 29433813, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4286', 'grad_norm': '0.8812', 'learning_rate': '4.965e-05', 'epoch': '0.3621', 'num_input_tokens_seen': 29435860, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6494', 'grad_norm': '1.125', 'learning_rate': '4.965e-05', 'epoch': '0.3621', 'num_input_tokens_seen': 29437907, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4481', 'grad_norm': '0.9096', 'learning_rate': '4.965e-05', 'epoch': '0.3621', 'num_input_tokens_seen': 29439954, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4124', 'grad_norm': '1.204', 'learning_rate': '4.965e-05', 'epoch': '0.3621', 'num_input_tokens_seen': 29442001, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8234', 'grad_norm': '1.358', 'learning_rate': '4.965e-05', 'epoch': '0.3622', 'num_input_tokens_seen': 29444048, 'train_runtime': '1.49e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4466', 'grad_norm': '1.086', 'learning_rate': '4.965e-05', 'epoch': '0.3622', 'num_input_tokens_seen': 29446095, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3743', 'grad_norm': '1.07', 'learning_rate': '4.965e-05', 'epoch': '0.3622', 'num_input_tokens_seen': 29448142, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.331', 'grad_norm': '0.9289', 'learning_rate': '4.965e-05', 'epoch': '0.3622', 'num_input_tokens_seen': 29450189, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3478', 'grad_norm': '0.9898', 'learning_rate': '4.965e-05', 'epoch': '0.3623', 'num_input_tokens_seen': 29452236, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9884', 'grad_norm': '1.705', 'learning_rate': '4.965e-05', 'epoch': '0.3623', 'num_input_tokens_seen': 29454283, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2261', 'grad_norm': '0.8238', 'learning_rate': '4.965e-05', 'epoch': '0.3623', 'num_input_tokens_seen': 29456330, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9993', 'grad_norm': '2.216', 'learning_rate': '4.965e-05', 'epoch': '0.3623', 'num_input_tokens_seen': 29458377, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5694', 'grad_norm': '1.301', 'learning_rate': '4.965e-05', 'epoch': '0.3624', 'num_input_tokens_seen': 29460424, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8813', 'grad_norm': '1.293', 'learning_rate': '4.965e-05', 'epoch': '0.3624', 'num_input_tokens_seen': 29462471, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2106', 'grad_norm': '0.8578', 'learning_rate': '4.965e-05', 'epoch': '0.3624', 'num_input_tokens_seen': 29464518, 'train_runtime': '1.491e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9548', 'grad_norm': '1.793', 'learning_rate': '4.965e-05', 'epoch': '0.3624', 'num_input_tokens_seen': 29466565, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4114', 'grad_norm': '1.237', 'learning_rate': '4.965e-05', 'epoch': '0.3625', 'num_input_tokens_seen': 29468612, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7606', 'grad_norm': '1.379', 'learning_rate': '4.965e-05', 'epoch': '0.3625', 'num_input_tokens_seen': 29470659, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.503', 'grad_norm': '0.9832', 'learning_rate': '4.965e-05', 'epoch': '0.3625', 'num_input_tokens_seen': 29472706, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.524', 'grad_norm': '2.308', 'learning_rate': '4.965e-05', 'epoch': '0.3625', 'num_input_tokens_seen': 29474753, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.122', 'grad_norm': '2.1', 'learning_rate': '4.965e-05', 'epoch': '0.3626', 'num_input_tokens_seen': 29476800, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5017', 'grad_norm': '1.393', 'learning_rate': '4.965e-05', 'epoch': '0.3626', 'num_input_tokens_seen': 29478847, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2643', 'grad_norm': '1.092', 'learning_rate': '4.965e-05', 'epoch': '0.3626', 'num_input_tokens_seen': 29480894, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.406', 'grad_norm': '2.524', 'learning_rate': '4.965e-05', 'epoch': '0.3626', 'num_input_tokens_seen': 29482941, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6864', 'grad_norm': '1.238', 'learning_rate': '4.965e-05', 'epoch': '0.3627', 'num_input_tokens_seen': 29484988, 'train_runtime': '1.492e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1593', 'grad_norm': '0.7617', 'learning_rate': '4.965e-05', 'epoch': '0.3627', 'num_input_tokens_seen': 29487035, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6224', 'grad_norm': '1.215', 'learning_rate': '4.965e-05', 'epoch': '0.3627', 'num_input_tokens_seen': 29489082, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5899', 'grad_norm': '1.38', 'learning_rate': '4.965e-05', 'epoch': '0.3627', 'num_input_tokens_seen': 29491129, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4219', 'grad_norm': '1.118', 'learning_rate': '4.965e-05', 'epoch': '0.3628', 'num_input_tokens_seen': 29493176, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4206', 'grad_norm': '1.054', 'learning_rate': '4.965e-05', 'epoch': '0.3628', 'num_input_tokens_seen': 29495223, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4409', 'grad_norm': '1.227', 'learning_rate': '4.965e-05', 'epoch': '0.3628', 'num_input_tokens_seen': 29497270, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4064', 'grad_norm': '1.276', 'learning_rate': '4.965e-05', 'epoch': '0.3628', 'num_input_tokens_seen': 29499317, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.173', 'grad_norm': '2.15', 'learning_rate': '4.965e-05', 'epoch': '0.3629', 'num_input_tokens_seen': 29501364, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4407', 'grad_norm': '1.398', 'learning_rate': '4.965e-05', 'epoch': '0.3629', 'num_input_tokens_seen': 29503411, 'train_runtime': '1.493e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.776', 'grad_norm': '2.915', 'learning_rate': '4.965e-05', 'epoch': '0.3629', 'num_input_tokens_seen': 29505458, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6939', 'grad_norm': '1.364', 'learning_rate': '4.965e-05', 'epoch': '0.3629', 'num_input_tokens_seen': 29507505, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.211', 'grad_norm': '1.889', 'learning_rate': '4.965e-05', 'epoch': '0.363', 'num_input_tokens_seen': 29509552, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4097', 'grad_norm': '0.9544', 'learning_rate': '4.965e-05', 'epoch': '0.363', 'num_input_tokens_seen': 29511599, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3813', 'grad_norm': '1.088', 'learning_rate': '4.965e-05', 'epoch': '0.363', 'num_input_tokens_seen': 29513646, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2994', 'grad_norm': '0.8867', 'learning_rate': '4.965e-05', 'epoch': '0.363', 'num_input_tokens_seen': 29515693, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.336', 'grad_norm': '0.7344', 'learning_rate': '4.965e-05', 'epoch': '0.3631', 'num_input_tokens_seen': 29517740, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.471', 'grad_norm': '1.056', 'learning_rate': '4.965e-05', 'epoch': '0.3631', 'num_input_tokens_seen': 29519787, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6647', 'grad_norm': '1.479', 'learning_rate': '4.965e-05', 'epoch': '0.3631', 'num_input_tokens_seen': 29521834, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6286', 'grad_norm': '1.379', 'learning_rate': '4.965e-05', 'epoch': '0.3631', 'num_input_tokens_seen': 29523881, 'train_runtime': '1.494e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.301', 'grad_norm': '2.564', 'learning_rate': '4.965e-05', 'epoch': '0.3632', 'num_input_tokens_seen': 29525928, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.518', 'grad_norm': '2.68', 'learning_rate': '4.964e-05', 'epoch': '0.3632', 'num_input_tokens_seen': 29527975, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2866', 'grad_norm': '0.8761', 'learning_rate': '4.964e-05', 'epoch': '0.3632', 'num_input_tokens_seen': 29530022, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5037', 'grad_norm': '1.284', 'learning_rate': '4.964e-05', 'epoch': '0.3632', 'num_input_tokens_seen': 29532069, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3662', 'grad_norm': '1.213', 'learning_rate': '4.964e-05', 'epoch': '0.3633', 'num_input_tokens_seen': 29534116, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5884', 'grad_norm': '1.08', 'learning_rate': '4.964e-05', 'epoch': '0.3633', 'num_input_tokens_seen': 29536163, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.036', 'grad_norm': '1.79', 'learning_rate': '4.964e-05', 'epoch': '0.3633', 'num_input_tokens_seen': 29538210, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2092', 'grad_norm': '0.7157', 'learning_rate': '4.964e-05', 'epoch': '0.3633', 'num_input_tokens_seen': 29540257, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5799', 'grad_norm': '0.9165', 'learning_rate': '4.964e-05', 'epoch': '0.3634', 'num_input_tokens_seen': 29542304, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3327', 'grad_norm': '0.833', 'learning_rate': '4.964e-05', 'epoch': '0.3634', 'num_input_tokens_seen': 29544351, 'train_runtime': '1.495e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.153', 'grad_norm': '2.218', 'learning_rate': '4.964e-05', 'epoch': '0.3634', 'num_input_tokens_seen': 29546398, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.033', 'grad_norm': '2.393', 'learning_rate': '4.964e-05', 'epoch': '0.3634', 'num_input_tokens_seen': 29548445, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8374', 'grad_norm': '1.154', 'learning_rate': '4.964e-05', 'epoch': '0.3635', 'num_input_tokens_seen': 29550492, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2611', 'grad_norm': '0.7974', 'learning_rate': '4.964e-05', 'epoch': '0.3635', 'num_input_tokens_seen': 29552539, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.394', 'grad_norm': '0.8557', 'learning_rate': '4.964e-05', 'epoch': '0.3635', 'num_input_tokens_seen': 29554586, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.523', 'grad_norm': '1.685', 'learning_rate': '4.964e-05', 'epoch': '0.3635', 'num_input_tokens_seen': 29556633, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2871', 'grad_norm': '0.9129', 'learning_rate': '4.964e-05', 'epoch': '0.3636', 'num_input_tokens_seen': 29558680, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6149', 'grad_norm': '1.372', 'learning_rate': '4.964e-05', 'epoch': '0.3636', 'num_input_tokens_seen': 29560727, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7877', 'grad_norm': '1.38', 'learning_rate': '4.964e-05', 'epoch': '0.3636', 'num_input_tokens_seen': 29562774, 'train_runtime': '1.496e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3996', 'grad_norm': '1.044', 'learning_rate': '4.964e-05', 'epoch': '0.3636', 'num_input_tokens_seen': 29564821, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4795', 'grad_norm': '1.126', 'learning_rate': '4.964e-05', 'epoch': '0.3637', 'num_input_tokens_seen': 29566868, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8121', 'grad_norm': '1.877', 'learning_rate': '4.964e-05', 'epoch': '0.3637', 'num_input_tokens_seen': 29568915, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3296', 'grad_norm': '1.008', 'learning_rate': '4.964e-05', 'epoch': '0.3637', 'num_input_tokens_seen': 29570962, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4729', 'grad_norm': '1.287', 'learning_rate': '4.964e-05', 'epoch': '0.3637', 'num_input_tokens_seen': 29573009, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3575', 'grad_norm': '0.9091', 'learning_rate': '4.964e-05', 'epoch': '0.3638', 'num_input_tokens_seen': 29575056, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6653', 'grad_norm': '1.256', 'learning_rate': '4.964e-05', 'epoch': '0.3638', 'num_input_tokens_seen': 29577103, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.4', 'grad_norm': '2.633', 'learning_rate': '4.964e-05', 'epoch': '0.3638', 'num_input_tokens_seen': 29579150, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7415', 'grad_norm': '1.614', 'learning_rate': '4.964e-05', 'epoch': '0.3638', 'num_input_tokens_seen': 29581197, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5126', 'grad_norm': '1.015', 'learning_rate': '4.964e-05', 'epoch': '0.3639', 'num_input_tokens_seen': 29583244, 'train_runtime': '1.497e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7068', 'grad_norm': '1.241', 'learning_rate': '4.964e-05', 'epoch': '0.3639', 'num_input_tokens_seen': 29585291, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.536', 'grad_norm': '1.096', 'learning_rate': '4.964e-05', 'epoch': '0.3639', 'num_input_tokens_seen': 29587338, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.38', 'grad_norm': '2.512', 'learning_rate': '4.964e-05', 'epoch': '0.3639', 'num_input_tokens_seen': 29589385, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2714', 'grad_norm': '0.8209', 'learning_rate': '4.964e-05', 'epoch': '0.364', 'num_input_tokens_seen': 29591432, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5676', 'grad_norm': '1.122', 'learning_rate': '4.964e-05', 'epoch': '0.364', 'num_input_tokens_seen': 29593479, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6599', 'grad_norm': '1.334', 'learning_rate': '4.964e-05', 'epoch': '0.364', 'num_input_tokens_seen': 29595526, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5514', 'grad_norm': '1.176', 'learning_rate': '4.964e-05', 'epoch': '0.3641', 'num_input_tokens_seen': 29597573, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.213', 'grad_norm': '2.136', 'learning_rate': '4.964e-05', 'epoch': '0.3641', 'num_input_tokens_seen': 29599620, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4351', 'grad_norm': '0.9148', 'learning_rate': '4.964e-05', 'epoch': '0.3641', 'num_input_tokens_seen': 29601667, 'train_runtime': '1.498e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9067', 'grad_norm': '1.701', 'learning_rate': '4.964e-05', 'epoch': '0.3641', 'num_input_tokens_seen': 29603714, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2331', 'grad_norm': '0.9501', 'learning_rate': '4.964e-05', 'epoch': '0.3642', 'num_input_tokens_seen': 29605761, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6578', 'grad_norm': '1.319', 'learning_rate': '4.964e-05', 'epoch': '0.3642', 'num_input_tokens_seen': 29607808, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4224', 'grad_norm': '1.198', 'learning_rate': '4.964e-05', 'epoch': '0.3642', 'num_input_tokens_seen': 29609855, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4248', 'grad_norm': '1.037', 'learning_rate': '4.964e-05', 'epoch': '0.3642', 'num_input_tokens_seen': 29611902, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.259', 'grad_norm': '0.8974', 'learning_rate': '4.964e-05', 'epoch': '0.3643', 'num_input_tokens_seen': 29613949, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5673', 'grad_norm': '1.12', 'learning_rate': '4.964e-05', 'epoch': '0.3643', 'num_input_tokens_seen': 29615996, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3166', 'grad_norm': '0.7935', 'learning_rate': '4.964e-05', 'epoch': '0.3643', 'num_input_tokens_seen': 29618043, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.482', 'grad_norm': '1.097', 'learning_rate': '4.964e-05', 'epoch': '0.3643', 'num_input_tokens_seen': 29620090, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5765', 'grad_norm': '1.038', 'learning_rate': '4.964e-05', 'epoch': '0.3644', 'num_input_tokens_seen': 29622137, 'train_runtime': '1.499e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1925', 'grad_norm': '0.8388', 'learning_rate': '4.964e-05', 'epoch': '0.3644', 'num_input_tokens_seen': 29624184, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5384', 'grad_norm': '1.153', 'learning_rate': '4.964e-05', 'epoch': '0.3644', 'num_input_tokens_seen': 29626231, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.138', 'grad_norm': '2.085', 'learning_rate': '4.964e-05', 'epoch': '0.3644', 'num_input_tokens_seen': 29628278, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.22', 'grad_norm': '2.295', 'learning_rate': '4.964e-05', 'epoch': '0.3645', 'num_input_tokens_seen': 29630325, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.543', 'grad_norm': '1.728', 'learning_rate': '4.964e-05', 'epoch': '0.3645', 'num_input_tokens_seen': 29632372, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4435', 'grad_norm': '0.9304', 'learning_rate': '4.964e-05', 'epoch': '0.3645', 'num_input_tokens_seen': 29634419, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5252', 'grad_norm': '1.578', 'learning_rate': '4.964e-05', 'epoch': '0.3645', 'num_input_tokens_seen': 29636466, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.062', 'grad_norm': '1.646', 'learning_rate': '4.964e-05', 'epoch': '0.3646', 'num_input_tokens_seen': 29638513, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3295', 'grad_norm': '0.874', 'learning_rate': '4.964e-05', 'epoch': '0.3646', 'num_input_tokens_seen': 29640560, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3432', 'grad_norm': '1.004', 'learning_rate': '4.964e-05', 'epoch': '0.3646', 'num_input_tokens_seen': 29642607, 'train_runtime': '1.5e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3221', 'grad_norm': '0.8193', 'learning_rate': '4.964e-05', 'epoch': '0.3646', 'num_input_tokens_seen': 29644654, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6435', 'grad_norm': '1.545', 'learning_rate': '4.964e-05', 'epoch': '0.3647', 'num_input_tokens_seen': 29646701, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3249', 'grad_norm': '0.8462', 'learning_rate': '4.964e-05', 'epoch': '0.3647', 'num_input_tokens_seen': 29648748, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2843', 'grad_norm': '0.8541', 'learning_rate': '4.964e-05', 'epoch': '0.3647', 'num_input_tokens_seen': 29650795, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3248', 'grad_norm': '0.82', 'learning_rate': '4.964e-05', 'epoch': '0.3647', 'num_input_tokens_seen': 29652842, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.066', 'grad_norm': '1.399', 'learning_rate': '4.964e-05', 'epoch': '0.3648', 'num_input_tokens_seen': 29654889, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4947', 'grad_norm': '1.234', 'learning_rate': '4.964e-05', 'epoch': '0.3648', 'num_input_tokens_seen': 29656936, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.285', 'grad_norm': '2.215', 'learning_rate': '4.964e-05', 'epoch': '0.3648', 'num_input_tokens_seen': 29658983, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8547', 'grad_norm': '1.546', 'learning_rate': '4.964e-05', 'epoch': '0.3648', 'num_input_tokens_seen': 29661030, 'train_runtime': '1.501e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8635', 'grad_norm': '1.581', 'learning_rate': '4.964e-05', 'epoch': '0.3649', 'num_input_tokens_seen': 29663077, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5276', 'grad_norm': '0.9757', 'learning_rate': '4.964e-05', 'epoch': '0.3649', 'num_input_tokens_seen': 29665124, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.046', 'grad_norm': '1.551', 'learning_rate': '4.964e-05', 'epoch': '0.3649', 'num_input_tokens_seen': 29667171, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3765', 'grad_norm': '0.9244', 'learning_rate': '4.964e-05', 'epoch': '0.3649', 'num_input_tokens_seen': 29669218, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6866', 'grad_norm': '1.204', 'learning_rate': '4.964e-05', 'epoch': '0.365', 'num_input_tokens_seen': 29671265, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8123', 'grad_norm': '1.379', 'learning_rate': '4.964e-05', 'epoch': '0.365', 'num_input_tokens_seen': 29673312, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8546', 'grad_norm': '1.956', 'learning_rate': '4.964e-05', 'epoch': '0.365', 'num_input_tokens_seen': 29675359, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.19', 'grad_norm': '2.402', 'learning_rate': '4.964e-05', 'epoch': '0.365', 'num_input_tokens_seen': 29677406, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.706', 'grad_norm': '1.625', 'learning_rate': '4.964e-05', 'epoch': '0.3651', 'num_input_tokens_seen': 29679453, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.901', 'grad_norm': '2.943', 'learning_rate': '4.964e-05', 'epoch': '0.3651', 'num_input_tokens_seen': 29681500, 'train_runtime': '1.502e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3355', 'grad_norm': '1.038', 'learning_rate': '4.964e-05', 'epoch': '0.3651', 'num_input_tokens_seen': 29683547, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6286', 'grad_norm': '1.344', 'learning_rate': '4.964e-05', 'epoch': '0.3651', 'num_input_tokens_seen': 29685594, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7041', 'grad_norm': '1.228', 'learning_rate': '4.964e-05', 'epoch': '0.3652', 'num_input_tokens_seen': 29687641, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.066', 'grad_norm': '1.666', 'learning_rate': '4.964e-05', 'epoch': '0.3652', 'num_input_tokens_seen': 29689688, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6411', 'grad_norm': '1.617', 'learning_rate': '4.964e-05', 'epoch': '0.3652', 'num_input_tokens_seen': 29691735, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.533', 'grad_norm': '2.296', 'learning_rate': '4.964e-05', 'epoch': '0.3652', 'num_input_tokens_seen': 29693782, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4483', 'grad_norm': '1.157', 'learning_rate': '4.964e-05', 'epoch': '0.3653', 'num_input_tokens_seen': 29695829, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.148', 'grad_norm': '1.476', 'learning_rate': '4.964e-05', 'epoch': '0.3653', 'num_input_tokens_seen': 29697876, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7875', 'grad_norm': '1.962', 'learning_rate': '4.964e-05', 'epoch': '0.3653', 'num_input_tokens_seen': 29699923, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2055', 'grad_norm': '0.8645', 'learning_rate': '4.964e-05', 'epoch': '0.3653', 'num_input_tokens_seen': 29701970, 'train_runtime': '1.503e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3462', 'grad_norm': '0.9472', 'learning_rate': '4.964e-05', 'epoch': '0.3654', 'num_input_tokens_seen': 29704017, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2196', 'grad_norm': '0.7103', 'learning_rate': '4.964e-05', 'epoch': '0.3654', 'num_input_tokens_seen': 29706064, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1997', 'grad_norm': '1.149', 'learning_rate': '4.964e-05', 'epoch': '0.3654', 'num_input_tokens_seen': 29708111, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2226', 'grad_norm': '0.8006', 'learning_rate': '4.964e-05', 'epoch': '0.3654', 'num_input_tokens_seen': 29710158, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2946', 'grad_norm': '0.9395', 'learning_rate': '4.964e-05', 'epoch': '0.3655', 'num_input_tokens_seen': 29712205, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.629', 'grad_norm': '3.041', 'learning_rate': '4.964e-05', 'epoch': '0.3655', 'num_input_tokens_seen': 29714252, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6369', 'grad_norm': '1.449', 'learning_rate': '4.964e-05', 'epoch': '0.3655', 'num_input_tokens_seen': 29716299, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.345', 'grad_norm': '2.488', 'learning_rate': '4.964e-05', 'epoch': '0.3655', 'num_input_tokens_seen': 29718346, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5688', 'grad_norm': '1.472', 'learning_rate': '4.964e-05', 'epoch': '0.3656', 'num_input_tokens_seen': 29720393, 'train_runtime': '1.504e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4734', 'grad_norm': '1.226', 'learning_rate': '4.964e-05', 'epoch': '0.3656', 'num_input_tokens_seen': 29722440, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5749', 'grad_norm': '1.101', 'learning_rate': '4.964e-05', 'epoch': '0.3656', 'num_input_tokens_seen': 29724487, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.09', 'grad_norm': '2.291', 'learning_rate': '4.964e-05', 'epoch': '0.3656', 'num_input_tokens_seen': 29726534, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2742', 'grad_norm': '0.767', 'learning_rate': '4.964e-05', 'epoch': '0.3657', 'num_input_tokens_seen': 29728581, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2885', 'grad_norm': '0.9725', 'learning_rate': '4.964e-05', 'epoch': '0.3657', 'num_input_tokens_seen': 29730628, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2735', 'grad_norm': '0.821', 'learning_rate': '4.964e-05', 'epoch': '0.3657', 'num_input_tokens_seen': 29732675, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9858', 'grad_norm': '2.097', 'learning_rate': '4.964e-05', 'epoch': '0.3657', 'num_input_tokens_seen': 29734722, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.742', 'grad_norm': '1.236', 'learning_rate': '4.964e-05', 'epoch': '0.3658', 'num_input_tokens_seen': 29736769, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3865', 'grad_norm': '1.12', 'learning_rate': '4.964e-05', 'epoch': '0.3658', 'num_input_tokens_seen': 29738816, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5247', 'grad_norm': '1.153', 'learning_rate': '4.964e-05', 'epoch': '0.3658', 'num_input_tokens_seen': 29740863, 'train_runtime': '1.505e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8508', 'grad_norm': '1.967', 'learning_rate': '4.964e-05', 'epoch': '0.3658', 'num_input_tokens_seen': 29742910, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.558', 'grad_norm': '2.934', 'learning_rate': '4.964e-05', 'epoch': '0.3659', 'num_input_tokens_seen': 29744957, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.004', 'grad_norm': '1.613', 'learning_rate': '4.964e-05', 'epoch': '0.3659', 'num_input_tokens_seen': 29747004, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6516', 'grad_norm': '1.118', 'learning_rate': '4.964e-05', 'epoch': '0.3659', 'num_input_tokens_seen': 29749051, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1706', 'grad_norm': '0.8293', 'learning_rate': '4.964e-05', 'epoch': '0.3659', 'num_input_tokens_seen': 29751098, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6497', 'grad_norm': '1.5', 'learning_rate': '4.964e-05', 'epoch': '0.366', 'num_input_tokens_seen': 29753145, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4453', 'grad_norm': '0.9928', 'learning_rate': '4.964e-05', 'epoch': '0.366', 'num_input_tokens_seen': 29755192, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5835', 'grad_norm': '1.152', 'learning_rate': '4.964e-05', 'epoch': '0.366', 'num_input_tokens_seen': 29757239, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.206', 'grad_norm': '3.328', 'learning_rate': '4.964e-05', 'epoch': '0.366', 'num_input_tokens_seen': 29759286, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7225', 'grad_norm': '1.011', 'learning_rate': '4.964e-05', 'epoch': '0.3661', 'num_input_tokens_seen': 29761333, 'train_runtime': '1.506e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.093', 'grad_norm': '2.013', 'learning_rate': '4.964e-05', 'epoch': '0.3661', 'num_input_tokens_seen': 29763380, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2029', 'grad_norm': '0.8855', 'learning_rate': '4.964e-05', 'epoch': '0.3661', 'num_input_tokens_seen': 29765427, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.594', 'grad_norm': '1.041', 'learning_rate': '4.964e-05', 'epoch': '0.3661', 'num_input_tokens_seen': 29767474, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2956', 'grad_norm': '0.8642', 'learning_rate': '4.964e-05', 'epoch': '0.3662', 'num_input_tokens_seen': 29769521, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4895', 'grad_norm': '1.134', 'learning_rate': '4.964e-05', 'epoch': '0.3662', 'num_input_tokens_seen': 29771568, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.752', 'grad_norm': '1.459', 'learning_rate': '4.964e-05', 'epoch': '0.3662', 'num_input_tokens_seen': 29773615, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4884', 'grad_norm': '1.051', 'learning_rate': '4.964e-05', 'epoch': '0.3662', 'num_input_tokens_seen': 29775662, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3494', 'grad_norm': '0.9106', 'learning_rate': '4.964e-05', 'epoch': '0.3663', 'num_input_tokens_seen': 29777709, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.39', 'grad_norm': '1.171', 'learning_rate': '4.964e-05', 'epoch': '0.3663', 'num_input_tokens_seen': 29779756, 'train_runtime': '1.507e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.507', 'grad_norm': '1.148', 'learning_rate': '4.964e-05', 'epoch': '0.3663', 'num_input_tokens_seen': 29781803, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7481', 'grad_norm': '1.316', 'learning_rate': '4.964e-05', 'epoch': '0.3663', 'num_input_tokens_seen': 29783850, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6068', 'grad_norm': '1.343', 'learning_rate': '4.964e-05', 'epoch': '0.3664', 'num_input_tokens_seen': 29785897, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8472', 'grad_norm': '1.556', 'learning_rate': '4.964e-05', 'epoch': '0.3664', 'num_input_tokens_seen': 29787944, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5957', 'grad_norm': '1.226', 'learning_rate': '4.964e-05', 'epoch': '0.3664', 'num_input_tokens_seen': 29789991, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4143', 'grad_norm': '1.193', 'learning_rate': '4.964e-05', 'epoch': '0.3664', 'num_input_tokens_seen': 29792038, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7012', 'grad_norm': '0.9908', 'learning_rate': '4.964e-05', 'epoch': '0.3665', 'num_input_tokens_seen': 29794085, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6958', 'grad_norm': '1.508', 'learning_rate': '4.964e-05', 'epoch': '0.3665', 'num_input_tokens_seen': 29796132, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6695', 'grad_norm': '1.072', 'learning_rate': '4.964e-05', 'epoch': '0.3665', 'num_input_tokens_seen': 29798179, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4238', 'grad_norm': '1.208', 'learning_rate': '4.964e-05', 'epoch': '0.3665', 'num_input_tokens_seen': 29800226, 'train_runtime': '1.508e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8204', 'grad_norm': '1.264', 'learning_rate': '4.964e-05', 'epoch': '0.3666', 'num_input_tokens_seen': 29802273, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8888', 'grad_norm': '1.594', 'learning_rate': '4.964e-05', 'epoch': '0.3666', 'num_input_tokens_seen': 29804320, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5321', 'grad_norm': '1.261', 'learning_rate': '4.964e-05', 'epoch': '0.3666', 'num_input_tokens_seen': 29806367, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4123', 'grad_norm': '1.057', 'learning_rate': '4.964e-05', 'epoch': '0.3666', 'num_input_tokens_seen': 29808414, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8479', 'grad_norm': '1.374', 'learning_rate': '4.964e-05', 'epoch': '0.3667', 'num_input_tokens_seen': 29810461, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9879', 'grad_norm': '1.714', 'learning_rate': '4.964e-05', 'epoch': '0.3667', 'num_input_tokens_seen': 29812508, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.353', 'grad_norm': '0.8189', 'learning_rate': '4.964e-05', 'epoch': '0.3667', 'num_input_tokens_seen': 29814555, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3731', 'grad_norm': '0.7727', 'learning_rate': '4.964e-05', 'epoch': '0.3667', 'num_input_tokens_seen': 29816602, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7267', 'grad_norm': '1.57', 'learning_rate': '4.964e-05', 'epoch': '0.3668', 'num_input_tokens_seen': 29818649, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.276', 'grad_norm': '2.289', 'learning_rate': '4.964e-05', 'epoch': '0.3668', 'num_input_tokens_seen': 29820696, 'train_runtime': '1.509e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4248', 'grad_norm': '1.164', 'learning_rate': '4.964e-05', 'epoch': '0.3668', 'num_input_tokens_seen': 29822743, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9709', 'grad_norm': '1.842', 'learning_rate': '4.964e-05', 'epoch': '0.3668', 'num_input_tokens_seen': 29824790, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3623', 'grad_norm': '0.8388', 'learning_rate': '4.964e-05', 'epoch': '0.3669', 'num_input_tokens_seen': 29826837, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5977', 'grad_norm': '1.259', 'learning_rate': '4.963e-05', 'epoch': '0.3669', 'num_input_tokens_seen': 29828884, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5304', 'grad_norm': '1.133', 'learning_rate': '4.963e-05', 'epoch': '0.3669', 'num_input_tokens_seen': 29830931, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3738', 'grad_norm': '0.825', 'learning_rate': '4.963e-05', 'epoch': '0.3669', 'num_input_tokens_seen': 29832978, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8616', 'grad_norm': '1.024', 'learning_rate': '4.963e-05', 'epoch': '0.367', 'num_input_tokens_seen': 29835025, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3114', 'grad_norm': '0.9711', 'learning_rate': '4.963e-05', 'epoch': '0.367', 'num_input_tokens_seen': 29837072, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6963', 'grad_norm': '1.441', 'learning_rate': '4.963e-05', 'epoch': '0.367', 'num_input_tokens_seen': 29839119, 'train_runtime': '1.51e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4466', 'grad_norm': '0.9311', 'learning_rate': '4.963e-05', 'epoch': '0.367', 'num_input_tokens_seen': 29841166, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.458', 'grad_norm': '2.93', 'learning_rate': '4.963e-05', 'epoch': '0.3671', 'num_input_tokens_seen': 29843213, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5649', 'grad_norm': '1.134', 'learning_rate': '4.963e-05', 'epoch': '0.3671', 'num_input_tokens_seen': 29845260, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5586', 'grad_norm': '1.424', 'learning_rate': '4.963e-05', 'epoch': '0.3671', 'num_input_tokens_seen': 29847307, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3268', 'grad_norm': '1.08', 'learning_rate': '4.963e-05', 'epoch': '0.3671', 'num_input_tokens_seen': 29849354, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.062', 'grad_norm': '2.338', 'learning_rate': '4.963e-05', 'epoch': '0.3672', 'num_input_tokens_seen': 29851401, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4349', 'grad_norm': '1.13', 'learning_rate': '4.963e-05', 'epoch': '0.3672', 'num_input_tokens_seen': 29853448, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6784', 'grad_norm': '1.583', 'learning_rate': '4.963e-05', 'epoch': '0.3672', 'num_input_tokens_seen': 29855495, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.573', 'grad_norm': '2.592', 'learning_rate': '4.963e-05', 'epoch': '0.3672', 'num_input_tokens_seen': 29857542, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3054', 'grad_norm': '0.726', 'learning_rate': '4.963e-05', 'epoch': '0.3673', 'num_input_tokens_seen': 29859589, 'train_runtime': '1.511e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3482', 'grad_norm': '0.8795', 'learning_rate': '4.963e-05', 'epoch': '0.3673', 'num_input_tokens_seen': 29861636, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.407', 'grad_norm': '0.8816', 'learning_rate': '4.963e-05', 'epoch': '0.3673', 'num_input_tokens_seen': 29863683, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8143', 'grad_norm': '2.028', 'learning_rate': '4.963e-05', 'epoch': '0.3673', 'num_input_tokens_seen': 29865730, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2541', 'grad_norm': '0.727', 'learning_rate': '4.963e-05', 'epoch': '0.3674', 'num_input_tokens_seen': 29867777, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2145', 'grad_norm': '0.7278', 'learning_rate': '4.963e-05', 'epoch': '0.3674', 'num_input_tokens_seen': 29869824, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3088', 'grad_norm': '0.9595', 'learning_rate': '4.963e-05', 'epoch': '0.3674', 'num_input_tokens_seen': 29871871, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2919', 'grad_norm': '0.7666', 'learning_rate': '4.963e-05', 'epoch': '0.3674', 'num_input_tokens_seen': 29873918, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7679', 'grad_norm': '1.066', 'learning_rate': '4.963e-05', 'epoch': '0.3675', 'num_input_tokens_seen': 29875965, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6236', 'grad_norm': '1.186', 'learning_rate': '4.963e-05', 'epoch': '0.3675', 'num_input_tokens_seen': 29878012, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5436', 'grad_norm': '1.172', 'learning_rate': '4.963e-05', 'epoch': '0.3675', 'num_input_tokens_seen': 29880059, 'train_runtime': '1.512e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5428', 'grad_norm': '0.8824', 'learning_rate': '4.963e-05', 'epoch': '0.3676', 'num_input_tokens_seen': 29882106, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.143', 'grad_norm': '1.849', 'learning_rate': '4.963e-05', 'epoch': '0.3676', 'num_input_tokens_seen': 29884153, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6011', 'grad_norm': '1.342', 'learning_rate': '4.963e-05', 'epoch': '0.3676', 'num_input_tokens_seen': 29886200, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3881', 'grad_norm': '0.9043', 'learning_rate': '4.963e-05', 'epoch': '0.3676', 'num_input_tokens_seen': 29888247, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.278', 'grad_norm': '1.872', 'learning_rate': '4.963e-05', 'epoch': '0.3677', 'num_input_tokens_seen': 29890294, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.487', 'grad_norm': '2.261', 'learning_rate': '4.963e-05', 'epoch': '0.3677', 'num_input_tokens_seen': 29892341, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2348', 'grad_norm': '0.7928', 'learning_rate': '4.963e-05', 'epoch': '0.3677', 'num_input_tokens_seen': 29894388, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3929', 'grad_norm': '0.8738', 'learning_rate': '4.963e-05', 'epoch': '0.3677', 'num_input_tokens_seen': 29896435, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.099', 'grad_norm': '2.029', 'learning_rate': '4.963e-05', 'epoch': '0.3678', 'num_input_tokens_seen': 29898482, 'train_runtime': '1.513e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3606', 'grad_norm': '0.8491', 'learning_rate': '4.963e-05', 'epoch': '0.3678', 'num_input_tokens_seen': 29900529, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.332', 'grad_norm': '0.9852', 'learning_rate': '4.963e-05', 'epoch': '0.3678', 'num_input_tokens_seen': 29902576, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5272', 'grad_norm': '1.163', 'learning_rate': '4.963e-05', 'epoch': '0.3678', 'num_input_tokens_seen': 29904623, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7243', 'grad_norm': '1.43', 'learning_rate': '4.963e-05', 'epoch': '0.3679', 'num_input_tokens_seen': 29906670, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7065', 'grad_norm': '1.036', 'learning_rate': '4.963e-05', 'epoch': '0.3679', 'num_input_tokens_seen': 29908717, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5014', 'grad_norm': '1.34', 'learning_rate': '4.963e-05', 'epoch': '0.3679', 'num_input_tokens_seen': 29910764, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2589', 'grad_norm': '0.8556', 'learning_rate': '4.963e-05', 'epoch': '0.3679', 'num_input_tokens_seen': 29912811, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6848', 'grad_norm': '1.204', 'learning_rate': '4.963e-05', 'epoch': '0.368', 'num_input_tokens_seen': 29914858, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4', 'grad_norm': '1.02', 'learning_rate': '4.963e-05', 'epoch': '0.368', 'num_input_tokens_seen': 29916905, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6169', 'grad_norm': '1.201', 'learning_rate': '4.963e-05', 'epoch': '0.368', 'num_input_tokens_seen': 29918952, 'train_runtime': '1.514e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2827', 'grad_norm': '0.7698', 'learning_rate': '4.963e-05', 'epoch': '0.368', 'num_input_tokens_seen': 29920999, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6916', 'grad_norm': '1.654', 'learning_rate': '4.963e-05', 'epoch': '0.3681', 'num_input_tokens_seen': 29923046, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6969', 'grad_norm': '1.462', 'learning_rate': '4.963e-05', 'epoch': '0.3681', 'num_input_tokens_seen': 29925093, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2861', 'grad_norm': '0.7324', 'learning_rate': '4.963e-05', 'epoch': '0.3681', 'num_input_tokens_seen': 29927140, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.495', 'grad_norm': '1.073', 'learning_rate': '4.963e-05', 'epoch': '0.3681', 'num_input_tokens_seen': 29929187, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6097', 'grad_norm': '1.504', 'learning_rate': '4.963e-05', 'epoch': '0.3682', 'num_input_tokens_seen': 29931234, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.564', 'grad_norm': '2.865', 'learning_rate': '4.963e-05', 'epoch': '0.3682', 'num_input_tokens_seen': 29933281, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6801', 'grad_norm': '1.177', 'learning_rate': '4.963e-05', 'epoch': '0.3682', 'num_input_tokens_seen': 29935328, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3811', 'grad_norm': '0.8705', 'learning_rate': '4.963e-05', 'epoch': '0.3682', 'num_input_tokens_seen': 29937375, 'train_runtime': '1.515e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.006', 'grad_norm': '1.404', 'learning_rate': '4.963e-05', 'epoch': '0.3683', 'num_input_tokens_seen': 29939422, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.259', 'grad_norm': '0.8825', 'learning_rate': '4.963e-05', 'epoch': '0.3683', 'num_input_tokens_seen': 29941469, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.167', 'grad_norm': '2.285', 'learning_rate': '4.963e-05', 'epoch': '0.3683', 'num_input_tokens_seen': 29943516, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5783', 'grad_norm': '0.9857', 'learning_rate': '4.963e-05', 'epoch': '0.3683', 'num_input_tokens_seen': 29945563, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6576', 'grad_norm': '1.324', 'learning_rate': '4.963e-05', 'epoch': '0.3684', 'num_input_tokens_seen': 29947610, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.271', 'grad_norm': '1.003', 'learning_rate': '4.963e-05', 'epoch': '0.3684', 'num_input_tokens_seen': 29949657, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2912', 'grad_norm': '0.8783', 'learning_rate': '4.963e-05', 'epoch': '0.3684', 'num_input_tokens_seen': 29951704, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8313', 'grad_norm': '2.251', 'learning_rate': '4.963e-05', 'epoch': '0.3684', 'num_input_tokens_seen': 29953751, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9842', 'grad_norm': '2.3', 'learning_rate': '4.963e-05', 'epoch': '0.3685', 'num_input_tokens_seen': 29955798, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2385', 'grad_norm': '0.8308', 'learning_rate': '4.963e-05', 'epoch': '0.3685', 'num_input_tokens_seen': 29957845, 'train_runtime': '1.516e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.684', 'grad_norm': '2.876', 'learning_rate': '4.963e-05', 'epoch': '0.3685', 'num_input_tokens_seen': 29959892, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8944', 'grad_norm': '1.383', 'learning_rate': '4.963e-05', 'epoch': '0.3685', 'num_input_tokens_seen': 29961939, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3986', 'grad_norm': '0.7214', 'learning_rate': '4.963e-05', 'epoch': '0.3686', 'num_input_tokens_seen': 29963986, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6479', 'grad_norm': '1.644', 'learning_rate': '4.963e-05', 'epoch': '0.3686', 'num_input_tokens_seen': 29966033, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5684', 'grad_norm': '1.02', 'learning_rate': '4.963e-05', 'epoch': '0.3686', 'num_input_tokens_seen': 29968080, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.219', 'grad_norm': '2.451', 'learning_rate': '4.963e-05', 'epoch': '0.3686', 'num_input_tokens_seen': 29970127, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9892', 'grad_norm': '2.02', 'learning_rate': '4.963e-05', 'epoch': '0.3687', 'num_input_tokens_seen': 29972174, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4992', 'grad_norm': '1.388', 'learning_rate': '4.963e-05', 'epoch': '0.3687', 'num_input_tokens_seen': 29974221, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7794', 'grad_norm': '1.009', 'learning_rate': '4.963e-05', 'epoch': '0.3687', 'num_input_tokens_seen': 29976268, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.497', 'grad_norm': '1.038', 'learning_rate': '4.963e-05', 'epoch': '0.3687', 'num_input_tokens_seen': 29978315, 'train_runtime': '1.517e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3731', 'grad_norm': '0.8096', 'learning_rate': '4.963e-05', 'epoch': '0.3688', 'num_input_tokens_seen': 29980362, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.36', 'grad_norm': '1.001', 'learning_rate': '4.963e-05', 'epoch': '0.3688', 'num_input_tokens_seen': 29982409, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2582', 'grad_norm': '0.8833', 'learning_rate': '4.963e-05', 'epoch': '0.3688', 'num_input_tokens_seen': 29984456, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1617', 'grad_norm': '0.7639', 'learning_rate': '4.963e-05', 'epoch': '0.3688', 'num_input_tokens_seen': 29986503, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2469', 'grad_norm': '0.8496', 'learning_rate': '4.963e-05', 'epoch': '0.3689', 'num_input_tokens_seen': 29988550, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.181', 'grad_norm': '0.8132', 'learning_rate': '4.963e-05', 'epoch': '0.3689', 'num_input_tokens_seen': 29990597, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9376', 'grad_norm': '1.886', 'learning_rate': '4.963e-05', 'epoch': '0.3689', 'num_input_tokens_seen': 29992644, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.025', 'grad_norm': '2.929', 'learning_rate': '4.963e-05', 'epoch': '0.3689', 'num_input_tokens_seen': 29994691, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6095', 'grad_norm': '1.368', 'learning_rate': '4.963e-05', 'epoch': '0.369', 'num_input_tokens_seen': 29996738, 'train_runtime': '1.518e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4674', 'grad_norm': '1.255', 'learning_rate': '4.963e-05', 'epoch': '0.369', 'num_input_tokens_seen': 29998785, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5398', 'grad_norm': '1.04', 'learning_rate': '4.963e-05', 'epoch': '0.369', 'num_input_tokens_seen': 30000832, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9287', 'grad_norm': '1.35', 'learning_rate': '4.963e-05', 'epoch': '0.369', 'num_input_tokens_seen': 30002879, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3486', 'grad_norm': '0.9206', 'learning_rate': '4.963e-05', 'epoch': '0.3691', 'num_input_tokens_seen': 30004926, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7053', 'grad_norm': '1.189', 'learning_rate': '4.963e-05', 'epoch': '0.3691', 'num_input_tokens_seen': 30006973, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6414', 'grad_norm': '1.785', 'learning_rate': '4.963e-05', 'epoch': '0.3691', 'num_input_tokens_seen': 30009020, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.568', 'grad_norm': '2.241', 'learning_rate': '4.963e-05', 'epoch': '0.3691', 'num_input_tokens_seen': 30011067, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.314', 'grad_norm': '2.273', 'learning_rate': '4.963e-05', 'epoch': '0.3692', 'num_input_tokens_seen': 30013114, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2789', 'grad_norm': '0.823', 'learning_rate': '4.963e-05', 'epoch': '0.3692', 'num_input_tokens_seen': 30015161, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3447', 'grad_norm': '0.8382', 'learning_rate': '4.963e-05', 'epoch': '0.3692', 'num_input_tokens_seen': 30017208, 'train_runtime': '1.519e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1811', 'grad_norm': '0.9481', 'learning_rate': '4.963e-05', 'epoch': '0.3692', 'num_input_tokens_seen': 30019255, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4676', 'grad_norm': '1.354', 'learning_rate': '4.963e-05', 'epoch': '0.3693', 'num_input_tokens_seen': 30021302, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3468', 'grad_norm': '0.8353', 'learning_rate': '4.963e-05', 'epoch': '0.3693', 'num_input_tokens_seen': 30023349, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8566', 'grad_norm': '1.464', 'learning_rate': '4.963e-05', 'epoch': '0.3693', 'num_input_tokens_seen': 30025396, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.157', 'grad_norm': '2.07', 'learning_rate': '4.963e-05', 'epoch': '0.3693', 'num_input_tokens_seen': 30027443, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3234', 'grad_norm': '0.872', 'learning_rate': '4.963e-05', 'epoch': '0.3694', 'num_input_tokens_seen': 30029490, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5589', 'grad_norm': '0.9263', 'learning_rate': '4.963e-05', 'epoch': '0.3694', 'num_input_tokens_seen': 30031537, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3523', 'grad_norm': '0.6585', 'learning_rate': '4.963e-05', 'epoch': '0.3694', 'num_input_tokens_seen': 30033584, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3447', 'grad_norm': '0.8035', 'learning_rate': '4.963e-05', 'epoch': '0.3694', 'num_input_tokens_seen': 30035631, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3786', 'grad_norm': '0.8889', 'learning_rate': '4.963e-05', 'epoch': '0.3695', 'num_input_tokens_seen': 30037678, 'train_runtime': '1.52e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.007', 'grad_norm': '1.409', 'learning_rate': '4.963e-05', 'epoch': '0.3695', 'num_input_tokens_seen': 30039725, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3096', 'grad_norm': '0.8289', 'learning_rate': '4.963e-05', 'epoch': '0.3695', 'num_input_tokens_seen': 30041772, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.345', 'grad_norm': '2.618', 'learning_rate': '4.963e-05', 'epoch': '0.3695', 'num_input_tokens_seen': 30043819, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.124', 'grad_norm': '2.418', 'learning_rate': '4.963e-05', 'epoch': '0.3696', 'num_input_tokens_seen': 30045866, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8205', 'grad_norm': '1.967', 'learning_rate': '4.963e-05', 'epoch': '0.3696', 'num_input_tokens_seen': 30047913, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6849', 'grad_norm': '1.448', 'learning_rate': '4.963e-05', 'epoch': '0.3696', 'num_input_tokens_seen': 30049960, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.263', 'grad_norm': '1.763', 'learning_rate': '4.963e-05', 'epoch': '0.3696', 'num_input_tokens_seen': 30052007, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3937', 'grad_norm': '0.7822', 'learning_rate': '4.963e-05', 'epoch': '0.3697', 'num_input_tokens_seen': 30054054, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2686', 'grad_norm': '0.8279', 'learning_rate': '4.963e-05', 'epoch': '0.3697', 'num_input_tokens_seen': 30056101, 'train_runtime': '1.521e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6702', 'grad_norm': '1.312', 'learning_rate': '4.963e-05', 'epoch': '0.3697', 'num_input_tokens_seen': 30058148, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8098', 'grad_norm': '1.779', 'learning_rate': '4.963e-05', 'epoch': '0.3697', 'num_input_tokens_seen': 30060195, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6938', 'grad_norm': '1.058', 'learning_rate': '4.963e-05', 'epoch': '0.3698', 'num_input_tokens_seen': 30062242, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3845', 'grad_norm': '0.8379', 'learning_rate': '4.963e-05', 'epoch': '0.3698', 'num_input_tokens_seen': 30064289, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5885', 'grad_norm': '1.42', 'learning_rate': '4.963e-05', 'epoch': '0.3698', 'num_input_tokens_seen': 30066336, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3899', 'grad_norm': '0.9824', 'learning_rate': '4.963e-05', 'epoch': '0.3698', 'num_input_tokens_seen': 30068383, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6665', 'grad_norm': '1.798', 'learning_rate': '4.963e-05', 'epoch': '0.3699', 'num_input_tokens_seen': 30070430, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.2', 'grad_norm': '1.974', 'learning_rate': '4.963e-05', 'epoch': '0.3699', 'num_input_tokens_seen': 30072477, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6798', 'grad_norm': '1.19', 'learning_rate': '4.963e-05', 'epoch': '0.3699', 'num_input_tokens_seen': 30074524, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2705', 'grad_norm': '0.7998', 'learning_rate': '4.963e-05', 'epoch': '0.3699', 'num_input_tokens_seen': 30076571, 'train_runtime': '1.522e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5357', 'grad_norm': '1.078', 'learning_rate': '4.963e-05', 'epoch': '0.37', 'num_input_tokens_seen': 30078618, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8406', 'grad_norm': '1.623', 'learning_rate': '4.963e-05', 'epoch': '0.37', 'num_input_tokens_seen': 30080665, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.472', 'grad_norm': '1.213', 'learning_rate': '4.963e-05', 'epoch': '0.37', 'num_input_tokens_seen': 30082712, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2887', 'grad_norm': '0.9101', 'learning_rate': '4.963e-05', 'epoch': '0.37', 'num_input_tokens_seen': 30084759, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.059', 'grad_norm': '1.809', 'learning_rate': '4.963e-05', 'epoch': '0.3701', 'num_input_tokens_seen': 30086806, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.027', 'grad_norm': '1.882', 'learning_rate': '4.963e-05', 'epoch': '0.3701', 'num_input_tokens_seen': 30088853, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.472', 'grad_norm': '2.71', 'learning_rate': '4.963e-05', 'epoch': '0.3701', 'num_input_tokens_seen': 30090900, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5609', 'grad_norm': '1.085', 'learning_rate': '4.963e-05', 'epoch': '0.3701', 'num_input_tokens_seen': 30092947, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3173', 'grad_norm': '0.8481', 'learning_rate': '4.963e-05', 'epoch': '0.3702', 'num_input_tokens_seen': 30094994, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7481', 'grad_norm': '1.765', 'learning_rate': '4.963e-05', 'epoch': '0.3702', 'num_input_tokens_seen': 30097041, 'train_runtime': '1.523e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6248', 'grad_norm': '1.367', 'learning_rate': '4.963e-05', 'epoch': '0.3702', 'num_input_tokens_seen': 30099088, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3685', 'grad_norm': '1.065', 'learning_rate': '4.963e-05', 'epoch': '0.3702', 'num_input_tokens_seen': 30101135, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7154', 'grad_norm': '1.27', 'learning_rate': '4.963e-05', 'epoch': '0.3703', 'num_input_tokens_seen': 30103182, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4842', 'grad_norm': '1.149', 'learning_rate': '4.963e-05', 'epoch': '0.3703', 'num_input_tokens_seen': 30105229, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5262', 'grad_norm': '1.028', 'learning_rate': '4.963e-05', 'epoch': '0.3703', 'num_input_tokens_seen': 30107276, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7893', 'grad_norm': '1.178', 'learning_rate': '4.963e-05', 'epoch': '0.3703', 'num_input_tokens_seen': 30109323, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9623', 'grad_norm': '1.689', 'learning_rate': '4.963e-05', 'epoch': '0.3704', 'num_input_tokens_seen': 30111370, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.369', 'grad_norm': '1.047', 'learning_rate': '4.963e-05', 'epoch': '0.3704', 'num_input_tokens_seen': 30113417, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.077', 'grad_norm': '2.365', 'learning_rate': '4.963e-05', 'epoch': '0.3704', 'num_input_tokens_seen': 30115464, 'train_runtime': '1.524e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.889', 'grad_norm': '3.404', 'learning_rate': '4.963e-05', 'epoch': '0.3704', 'num_input_tokens_seen': 30117511, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.599', 'grad_norm': '2.873', 'learning_rate': '4.963e-05', 'epoch': '0.3705', 'num_input_tokens_seen': 30119558, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.157', 'grad_norm': '2.03', 'learning_rate': '4.963e-05', 'epoch': '0.3705', 'num_input_tokens_seen': 30121605, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3675', 'grad_norm': '1.037', 'learning_rate': '4.963e-05', 'epoch': '0.3705', 'num_input_tokens_seen': 30123652, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8033', 'grad_norm': '0.9822', 'learning_rate': '4.962e-05', 'epoch': '0.3705', 'num_input_tokens_seen': 30125699, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.039', 'grad_norm': '1.55', 'learning_rate': '4.962e-05', 'epoch': '0.3706', 'num_input_tokens_seen': 30127746, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5202', 'grad_norm': '1.137', 'learning_rate': '4.962e-05', 'epoch': '0.3706', 'num_input_tokens_seen': 30129793, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4578', 'grad_norm': '1.166', 'learning_rate': '4.962e-05', 'epoch': '0.3706', 'num_input_tokens_seen': 30131840, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.244', 'grad_norm': '2.084', 'learning_rate': '4.962e-05', 'epoch': '0.3706', 'num_input_tokens_seen': 30133887, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6383', 'grad_norm': '1.018', 'learning_rate': '4.962e-05', 'epoch': '0.3707', 'num_input_tokens_seen': 30135934, 'train_runtime': '1.525e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9386', 'grad_norm': '1.717', 'learning_rate': '4.962e-05', 'epoch': '0.3707', 'num_input_tokens_seen': 30137981, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2869', 'grad_norm': '0.8749', 'learning_rate': '4.962e-05', 'epoch': '0.3707', 'num_input_tokens_seen': 30140028, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7871', 'grad_norm': '1.21', 'learning_rate': '4.962e-05', 'epoch': '0.3707', 'num_input_tokens_seen': 30142075, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4634', 'grad_norm': '1.148', 'learning_rate': '4.962e-05', 'epoch': '0.3708', 'num_input_tokens_seen': 30144122, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9363', 'grad_norm': '1.874', 'learning_rate': '4.962e-05', 'epoch': '0.3708', 'num_input_tokens_seen': 30146169, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6853', 'grad_norm': '2.444', 'learning_rate': '4.962e-05', 'epoch': '0.3708', 'num_input_tokens_seen': 30148216, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.302', 'grad_norm': '2.485', 'learning_rate': '4.962e-05', 'epoch': '0.3708', 'num_input_tokens_seen': 30150263, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5251', 'grad_norm': '1.062', 'learning_rate': '4.962e-05', 'epoch': '0.3709', 'num_input_tokens_seen': 30152310, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5628', 'grad_norm': '1.492', 'learning_rate': '4.962e-05', 'epoch': '0.3709', 'num_input_tokens_seen': 30154357, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.899', 'grad_norm': '1.862', 'learning_rate': '4.962e-05', 'epoch': '0.3709', 'num_input_tokens_seen': 30156404, 'train_runtime': '1.526e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4319', 'grad_norm': '1.131', 'learning_rate': '4.962e-05', 'epoch': '0.3709', 'num_input_tokens_seen': 30158451, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.329', 'grad_norm': '0.9699', 'learning_rate': '4.962e-05', 'epoch': '0.371', 'num_input_tokens_seen': 30160498, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1956', 'grad_norm': '0.7214', 'learning_rate': '4.962e-05', 'epoch': '0.371', 'num_input_tokens_seen': 30162545, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6105', 'grad_norm': '1.428', 'learning_rate': '4.962e-05', 'epoch': '0.371', 'num_input_tokens_seen': 30164592, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.697', 'grad_norm': '2.488', 'learning_rate': '4.962e-05', 'epoch': '0.3711', 'num_input_tokens_seen': 30166639, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3078', 'grad_norm': '0.8914', 'learning_rate': '4.962e-05', 'epoch': '0.3711', 'num_input_tokens_seen': 30168686, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4086', 'grad_norm': '0.8907', 'learning_rate': '4.962e-05', 'epoch': '0.3711', 'num_input_tokens_seen': 30170733, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3527', 'grad_norm': '0.8323', 'learning_rate': '4.962e-05', 'epoch': '0.3711', 'num_input_tokens_seen': 30172780, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6862', 'grad_norm': '1.519', 'learning_rate': '4.962e-05', 'epoch': '0.3712', 'num_input_tokens_seen': 30174827, 'train_runtime': '1.527e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.648', 'grad_norm': '1.012', 'learning_rate': '4.962e-05', 'epoch': '0.3712', 'num_input_tokens_seen': 30176874, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6398', 'grad_norm': '1.053', 'learning_rate': '4.962e-05', 'epoch': '0.3712', 'num_input_tokens_seen': 30178921, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9386', 'grad_norm': '2.427', 'learning_rate': '4.962e-05', 'epoch': '0.3712', 'num_input_tokens_seen': 30180968, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.114', 'grad_norm': '2.319', 'learning_rate': '4.962e-05', 'epoch': '0.3713', 'num_input_tokens_seen': 30183015, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.631', 'grad_norm': '1.323', 'learning_rate': '4.962e-05', 'epoch': '0.3713', 'num_input_tokens_seen': 30185062, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4757', 'grad_norm': '0.9394', 'learning_rate': '4.962e-05', 'epoch': '0.3713', 'num_input_tokens_seen': 30187109, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2633', 'grad_norm': '0.923', 'learning_rate': '4.962e-05', 'epoch': '0.3713', 'num_input_tokens_seen': 30189156, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5119', 'grad_norm': '1.102', 'learning_rate': '4.962e-05', 'epoch': '0.3714', 'num_input_tokens_seen': 30191203, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.032', 'grad_norm': '1.684', 'learning_rate': '4.962e-05', 'epoch': '0.3714', 'num_input_tokens_seen': 30193250, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4097', 'grad_norm': '1.208', 'learning_rate': '4.962e-05', 'epoch': '0.3714', 'num_input_tokens_seen': 30195297, 'train_runtime': '1.528e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8504', 'grad_norm': '1.313', 'learning_rate': '4.962e-05', 'epoch': '0.3714', 'num_input_tokens_seen': 30197344, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5651', 'grad_norm': '1.194', 'learning_rate': '4.962e-05', 'epoch': '0.3715', 'num_input_tokens_seen': 30199391, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.071', 'grad_norm': '2.346', 'learning_rate': '4.962e-05', 'epoch': '0.3715', 'num_input_tokens_seen': 30201438, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.946', 'grad_norm': '1.805', 'learning_rate': '4.962e-05', 'epoch': '0.3715', 'num_input_tokens_seen': 30203485, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.519', 'grad_norm': '0.9751', 'learning_rate': '4.962e-05', 'epoch': '0.3715', 'num_input_tokens_seen': 30205532, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5921', 'grad_norm': '1.36', 'learning_rate': '4.962e-05', 'epoch': '0.3716', 'num_input_tokens_seen': 30207579, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.214', 'grad_norm': '0.785', 'learning_rate': '4.962e-05', 'epoch': '0.3716', 'num_input_tokens_seen': 30209626, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2948', 'grad_norm': '0.7916', 'learning_rate': '4.962e-05', 'epoch': '0.3716', 'num_input_tokens_seen': 30211673, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6113', 'grad_norm': '1.471', 'learning_rate': '4.962e-05', 'epoch': '0.3716', 'num_input_tokens_seen': 30213720, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2972', 'grad_norm': '0.784', 'learning_rate': '4.962e-05', 'epoch': '0.3717', 'num_input_tokens_seen': 30215767, 'train_runtime': '1.529e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.589', 'grad_norm': '2.448', 'learning_rate': '4.962e-05', 'epoch': '0.3717', 'num_input_tokens_seen': 30217814, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5134', 'grad_norm': '1.258', 'learning_rate': '4.962e-05', 'epoch': '0.3717', 'num_input_tokens_seen': 30219861, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.334', 'grad_norm': '1.074', 'learning_rate': '4.962e-05', 'epoch': '0.3717', 'num_input_tokens_seen': 30221908, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1916', 'grad_norm': '0.8548', 'learning_rate': '4.962e-05', 'epoch': '0.3718', 'num_input_tokens_seen': 30223955, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3955', 'grad_norm': '0.8941', 'learning_rate': '4.962e-05', 'epoch': '0.3718', 'num_input_tokens_seen': 30226002, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2861', 'grad_norm': '0.9239', 'learning_rate': '4.962e-05', 'epoch': '0.3718', 'num_input_tokens_seen': 30228049, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2118', 'grad_norm': '0.9724', 'learning_rate': '4.962e-05', 'epoch': '0.3718', 'num_input_tokens_seen': 30230096, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7607', 'grad_norm': '1.415', 'learning_rate': '4.962e-05', 'epoch': '0.3719', 'num_input_tokens_seen': 30232143, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7067', 'grad_norm': '1.267', 'learning_rate': '4.962e-05', 'epoch': '0.3719', 'num_input_tokens_seen': 30234190, 'train_runtime': '1.53e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3722', 'grad_norm': '1.093', 'learning_rate': '4.962e-05', 'epoch': '0.3719', 'num_input_tokens_seen': 30236237, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5802', 'grad_norm': '1.51', 'learning_rate': '4.962e-05', 'epoch': '0.3719', 'num_input_tokens_seen': 30238284, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8372', 'grad_norm': '1.952', 'learning_rate': '4.962e-05', 'epoch': '0.372', 'num_input_tokens_seen': 30240331, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.678', 'grad_norm': '2.351', 'learning_rate': '4.962e-05', 'epoch': '0.372', 'num_input_tokens_seen': 30242378, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.961', 'grad_norm': '1.589', 'learning_rate': '4.962e-05', 'epoch': '0.372', 'num_input_tokens_seen': 30244425, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2984', 'grad_norm': '0.8319', 'learning_rate': '4.962e-05', 'epoch': '0.372', 'num_input_tokens_seen': 30246472, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2446', 'grad_norm': '0.9249', 'learning_rate': '4.962e-05', 'epoch': '0.3721', 'num_input_tokens_seen': 30248519, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.258', 'grad_norm': '0.8585', 'learning_rate': '4.962e-05', 'epoch': '0.3721', 'num_input_tokens_seen': 30250566, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.209', 'grad_norm': '2.389', 'learning_rate': '4.962e-05', 'epoch': '0.3721', 'num_input_tokens_seen': 30252613, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3489', 'grad_norm': '1.431', 'learning_rate': '4.962e-05', 'epoch': '0.3721', 'num_input_tokens_seen': 30254660, 'train_runtime': '1.531e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6758', 'grad_norm': '1.205', 'learning_rate': '4.962e-05', 'epoch': '0.3722', 'num_input_tokens_seen': 30256707, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5744', 'grad_norm': '1.268', 'learning_rate': '4.962e-05', 'epoch': '0.3722', 'num_input_tokens_seen': 30258754, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7443', 'grad_norm': '0.9987', 'learning_rate': '4.962e-05', 'epoch': '0.3722', 'num_input_tokens_seen': 30260801, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6663', 'grad_norm': '1.371', 'learning_rate': '4.962e-05', 'epoch': '0.3722', 'num_input_tokens_seen': 30262848, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.13', 'grad_norm': '2.152', 'learning_rate': '4.962e-05', 'epoch': '0.3723', 'num_input_tokens_seen': 30264895, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3389', 'grad_norm': '0.8972', 'learning_rate': '4.962e-05', 'epoch': '0.3723', 'num_input_tokens_seen': 30266942, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3118', 'grad_norm': '0.7497', 'learning_rate': '4.962e-05', 'epoch': '0.3723', 'num_input_tokens_seen': 30268989, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2185', 'grad_norm': '0.8673', 'learning_rate': '4.962e-05', 'epoch': '0.3723', 'num_input_tokens_seen': 30271036, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8077', 'grad_norm': '1.482', 'learning_rate': '4.962e-05', 'epoch': '0.3724', 'num_input_tokens_seen': 30273083, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3515', 'grad_norm': '1.175', 'learning_rate': '4.962e-05', 'epoch': '0.3724', 'num_input_tokens_seen': 30275130, 'train_runtime': '1.532e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.386', 'grad_norm': '1.239', 'learning_rate': '4.962e-05', 'epoch': '0.3724', 'num_input_tokens_seen': 30277177, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5991', 'grad_norm': '1.213', 'learning_rate': '4.962e-05', 'epoch': '0.3724', 'num_input_tokens_seen': 30279224, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4132', 'grad_norm': '1.1', 'learning_rate': '4.962e-05', 'epoch': '0.3725', 'num_input_tokens_seen': 30281271, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.461', 'grad_norm': '2.459', 'learning_rate': '4.962e-05', 'epoch': '0.3725', 'num_input_tokens_seen': 30283318, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9598', 'grad_norm': '1.592', 'learning_rate': '4.962e-05', 'epoch': '0.3725', 'num_input_tokens_seen': 30285365, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4874', 'grad_norm': '1.204', 'learning_rate': '4.962e-05', 'epoch': '0.3725', 'num_input_tokens_seen': 30287412, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6838', 'grad_norm': '1.243', 'learning_rate': '4.962e-05', 'epoch': '0.3726', 'num_input_tokens_seen': 30289459, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.159', 'grad_norm': '1.922', 'learning_rate': '4.962e-05', 'epoch': '0.3726', 'num_input_tokens_seen': 30291506, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3917', 'grad_norm': '1.025', 'learning_rate': '4.962e-05', 'epoch': '0.3726', 'num_input_tokens_seen': 30293553, 'train_runtime': '1.533e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2592', 'grad_norm': '0.8862', 'learning_rate': '4.962e-05', 'epoch': '0.3726', 'num_input_tokens_seen': 30295600, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9043', 'grad_norm': '1.348', 'learning_rate': '4.962e-05', 'epoch': '0.3727', 'num_input_tokens_seen': 30297647, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2947', 'grad_norm': '0.9493', 'learning_rate': '4.962e-05', 'epoch': '0.3727', 'num_input_tokens_seen': 30299694, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.571', 'grad_norm': '4.747', 'learning_rate': '4.962e-05', 'epoch': '0.3727', 'num_input_tokens_seen': 30301741, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5873', 'grad_norm': '1.571', 'learning_rate': '4.962e-05', 'epoch': '0.3727', 'num_input_tokens_seen': 30303788, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.609', 'grad_norm': '2.783', 'learning_rate': '4.962e-05', 'epoch': '0.3728', 'num_input_tokens_seen': 30305835, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.025', 'grad_norm': '1.887', 'learning_rate': '4.962e-05', 'epoch': '0.3728', 'num_input_tokens_seen': 30307882, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.374', 'grad_norm': '0.9923', 'learning_rate': '4.962e-05', 'epoch': '0.3728', 'num_input_tokens_seen': 30309929, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6977', 'grad_norm': '1.324', 'learning_rate': '4.962e-05', 'epoch': '0.3728', 'num_input_tokens_seen': 30311976, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3187', 'grad_norm': '1.053', 'learning_rate': '4.962e-05', 'epoch': '0.3729', 'num_input_tokens_seen': 30314023, 'train_runtime': '1.534e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7379', 'grad_norm': '2.08', 'learning_rate': '4.962e-05', 'epoch': '0.3729', 'num_input_tokens_seen': 30316070, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6894', 'grad_norm': '1.735', 'learning_rate': '4.962e-05', 'epoch': '0.3729', 'num_input_tokens_seen': 30318117, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.402', 'grad_norm': '1.036', 'learning_rate': '4.962e-05', 'epoch': '0.3729', 'num_input_tokens_seen': 30320164, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.711', 'grad_norm': '1.173', 'learning_rate': '4.962e-05', 'epoch': '0.373', 'num_input_tokens_seen': 30322211, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.725', 'grad_norm': '2.663', 'learning_rate': '4.962e-05', 'epoch': '0.373', 'num_input_tokens_seen': 30324258, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.738', 'grad_norm': '1.748', 'learning_rate': '4.962e-05', 'epoch': '0.373', 'num_input_tokens_seen': 30326305, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9935', 'grad_norm': '1.741', 'learning_rate': '4.962e-05', 'epoch': '0.373', 'num_input_tokens_seen': 30328352, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4664', 'grad_norm': '1.176', 'learning_rate': '4.962e-05', 'epoch': '0.3731', 'num_input_tokens_seen': 30330399, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7999', 'grad_norm': '1.191', 'learning_rate': '4.962e-05', 'epoch': '0.3731', 'num_input_tokens_seen': 30332446, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8627', 'grad_norm': '1.545', 'learning_rate': '4.962e-05', 'epoch': '0.3731', 'num_input_tokens_seen': 30334493, 'train_runtime': '1.535e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7893', 'grad_norm': '1.528', 'learning_rate': '4.962e-05', 'epoch': '0.3731', 'num_input_tokens_seen': 30336540, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.137', 'grad_norm': '1.989', 'learning_rate': '4.962e-05', 'epoch': '0.3732', 'num_input_tokens_seen': 30338587, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7568', 'grad_norm': '1.377', 'learning_rate': '4.962e-05', 'epoch': '0.3732', 'num_input_tokens_seen': 30340634, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.072', 'grad_norm': '2.181', 'learning_rate': '4.962e-05', 'epoch': '0.3732', 'num_input_tokens_seen': 30342681, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7971', 'grad_norm': '1.54', 'learning_rate': '4.962e-05', 'epoch': '0.3732', 'num_input_tokens_seen': 30344728, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5982', 'grad_norm': '1.124', 'learning_rate': '4.962e-05', 'epoch': '0.3733', 'num_input_tokens_seen': 30346775, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5677', 'grad_norm': '1.385', 'learning_rate': '4.962e-05', 'epoch': '0.3733', 'num_input_tokens_seen': 30348822, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6106', 'grad_norm': '1.168', 'learning_rate': '4.962e-05', 'epoch': '0.3733', 'num_input_tokens_seen': 30350869, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4039', 'grad_norm': '1.181', 'learning_rate': '4.962e-05', 'epoch': '0.3733', 'num_input_tokens_seen': 30352916, 'train_runtime': '1.536e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.567', 'grad_norm': '1.293', 'learning_rate': '4.962e-05', 'epoch': '0.3734', 'num_input_tokens_seen': 30354963, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8224', 'grad_norm': '1.379', 'learning_rate': '4.962e-05', 'epoch': '0.3734', 'num_input_tokens_seen': 30357010, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.112', 'grad_norm': '1.813', 'learning_rate': '4.962e-05', 'epoch': '0.3734', 'num_input_tokens_seen': 30359057, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6327', 'grad_norm': '1.382', 'learning_rate': '4.962e-05', 'epoch': '0.3734', 'num_input_tokens_seen': 30361104, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8818', 'grad_norm': '1.617', 'learning_rate': '4.962e-05', 'epoch': '0.3735', 'num_input_tokens_seen': 30363151, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.224', 'grad_norm': '2.422', 'learning_rate': '4.962e-05', 'epoch': '0.3735', 'num_input_tokens_seen': 30365198, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6417', 'grad_norm': '1.4', 'learning_rate': '4.962e-05', 'epoch': '0.3735', 'num_input_tokens_seen': 30367245, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.21', 'grad_norm': '2.368', 'learning_rate': '4.962e-05', 'epoch': '0.3735', 'num_input_tokens_seen': 30369292, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3329', 'grad_norm': '0.9207', 'learning_rate': '4.962e-05', 'epoch': '0.3736', 'num_input_tokens_seen': 30371339, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8555', 'grad_norm': '1.223', 'learning_rate': '4.962e-05', 'epoch': '0.3736', 'num_input_tokens_seen': 30373386, 'train_runtime': '1.537e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3088', 'grad_norm': '0.737', 'learning_rate': '4.962e-05', 'epoch': '0.3736', 'num_input_tokens_seen': 30375433, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2737', 'grad_norm': '0.925', 'learning_rate': '4.962e-05', 'epoch': '0.3736', 'num_input_tokens_seen': 30377480, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.406', 'grad_norm': '2.915', 'learning_rate': '4.962e-05', 'epoch': '0.3737', 'num_input_tokens_seen': 30379527, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.276', 'grad_norm': '2.253', 'learning_rate': '4.962e-05', 'epoch': '0.3737', 'num_input_tokens_seen': 30381574, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.086', 'grad_norm': '1.925', 'learning_rate': '4.962e-05', 'epoch': '0.3737', 'num_input_tokens_seen': 30383621, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5316', 'grad_norm': '1.003', 'learning_rate': '4.962e-05', 'epoch': '0.3737', 'num_input_tokens_seen': 30385668, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8826', 'grad_norm': '1.405', 'learning_rate': '4.962e-05', 'epoch': '0.3738', 'num_input_tokens_seen': 30387715, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6319', 'grad_norm': '1.198', 'learning_rate': '4.962e-05', 'epoch': '0.3738', 'num_input_tokens_seen': 30389762, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4566', 'grad_norm': '1.252', 'learning_rate': '4.962e-05', 'epoch': '0.3738', 'num_input_tokens_seen': 30391809, 'train_runtime': '1.538e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8137', 'grad_norm': '1.712', 'learning_rate': '4.962e-05', 'epoch': '0.3738', 'num_input_tokens_seen': 30393856, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6877', 'grad_norm': '1.303', 'learning_rate': '4.962e-05', 'epoch': '0.3739', 'num_input_tokens_seen': 30395903, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4879', 'grad_norm': '0.9328', 'learning_rate': '4.962e-05', 'epoch': '0.3739', 'num_input_tokens_seen': 30397950, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6042', 'grad_norm': '1.698', 'learning_rate': '4.962e-05', 'epoch': '0.3739', 'num_input_tokens_seen': 30399997, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3022', 'grad_norm': '0.9104', 'learning_rate': '4.962e-05', 'epoch': '0.3739', 'num_input_tokens_seen': 30402044, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7019', 'grad_norm': '1.152', 'learning_rate': '4.962e-05', 'epoch': '0.374', 'num_input_tokens_seen': 30404091, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2457', 'grad_norm': '0.9425', 'learning_rate': '4.962e-05', 'epoch': '0.374', 'num_input_tokens_seen': 30406138, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9933', 'grad_norm': '1.453', 'learning_rate': '4.962e-05', 'epoch': '0.374', 'num_input_tokens_seen': 30408185, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6129', 'grad_norm': '1.529', 'learning_rate': '4.962e-05', 'epoch': '0.374', 'num_input_tokens_seen': 30410232, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.388', 'grad_norm': '1.919', 'learning_rate': '4.962e-05', 'epoch': '0.3741', 'num_input_tokens_seen': 30412279, 'train_runtime': '1.539e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6549', 'grad_norm': '1.131', 'learning_rate': '4.962e-05', 'epoch': '0.3741', 'num_input_tokens_seen': 30414326, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2178', 'grad_norm': '0.9714', 'learning_rate': '4.961e-05', 'epoch': '0.3741', 'num_input_tokens_seen': 30416373, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4032', 'grad_norm': '1.206', 'learning_rate': '4.961e-05', 'epoch': '0.3741', 'num_input_tokens_seen': 30418420, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4462', 'grad_norm': '1.042', 'learning_rate': '4.961e-05', 'epoch': '0.3742', 'num_input_tokens_seen': 30420467, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4562', 'grad_norm': '1.089', 'learning_rate': '4.961e-05', 'epoch': '0.3742', 'num_input_tokens_seen': 30422514, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5797', 'grad_norm': '1.218', 'learning_rate': '4.961e-05', 'epoch': '0.3742', 'num_input_tokens_seen': 30424561, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9051', 'grad_norm': '1.496', 'learning_rate': '4.961e-05', 'epoch': '0.3742', 'num_input_tokens_seen': 30426608, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2017', 'grad_norm': '0.8371', 'learning_rate': '4.961e-05', 'epoch': '0.3743', 'num_input_tokens_seen': 30428655, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3532', 'grad_norm': '0.8244', 'learning_rate': '4.961e-05', 'epoch': '0.3743', 'num_input_tokens_seen': 30430702, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6087', 'grad_norm': '1.253', 'learning_rate': '4.961e-05', 'epoch': '0.3743', 'num_input_tokens_seen': 30432749, 'train_runtime': '1.54e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2498', 'grad_norm': '0.8254', 'learning_rate': '4.961e-05', 'epoch': '0.3743', 'num_input_tokens_seen': 30434796, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7171', 'grad_norm': '1.483', 'learning_rate': '4.961e-05', 'epoch': '0.3744', 'num_input_tokens_seen': 30436843, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.086', 'grad_norm': '2.018', 'learning_rate': '4.961e-05', 'epoch': '0.3744', 'num_input_tokens_seen': 30438890, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4353', 'grad_norm': '0.9881', 'learning_rate': '4.961e-05', 'epoch': '0.3744', 'num_input_tokens_seen': 30440937, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7308', 'grad_norm': '1.601', 'learning_rate': '4.961e-05', 'epoch': '0.3744', 'num_input_tokens_seen': 30442984, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.15', 'grad_norm': '1.894', 'learning_rate': '4.961e-05', 'epoch': '0.3745', 'num_input_tokens_seen': 30445031, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6532', 'grad_norm': '1.089', 'learning_rate': '4.961e-05', 'epoch': '0.3745', 'num_input_tokens_seen': 30447078, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.477', 'grad_norm': '1.079', 'learning_rate': '4.961e-05', 'epoch': '0.3745', 'num_input_tokens_seen': 30449125, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8125', 'grad_norm': '1.275', 'learning_rate': '4.961e-05', 'epoch': '0.3745', 'num_input_tokens_seen': 30451172, 'train_runtime': '1.541e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2472', 'grad_norm': '0.905', 'learning_rate': '4.961e-05', 'epoch': '0.3746', 'num_input_tokens_seen': 30453219, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.031', 'grad_norm': '1.679', 'learning_rate': '4.961e-05', 'epoch': '0.3746', 'num_input_tokens_seen': 30455266, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.479', 'grad_norm': '1.974', 'learning_rate': '4.961e-05', 'epoch': '0.3746', 'num_input_tokens_seen': 30457313, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.742', 'grad_norm': '1.41', 'learning_rate': '4.961e-05', 'epoch': '0.3747', 'num_input_tokens_seen': 30459360, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3509', 'grad_norm': '0.9441', 'learning_rate': '4.961e-05', 'epoch': '0.3747', 'num_input_tokens_seen': 30461407, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7114', 'grad_norm': '1.856', 'learning_rate': '4.961e-05', 'epoch': '0.3747', 'num_input_tokens_seen': 30463454, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6207', 'grad_norm': '1.171', 'learning_rate': '4.961e-05', 'epoch': '0.3747', 'num_input_tokens_seen': 30465501, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6985', 'grad_norm': '1.938', 'learning_rate': '4.961e-05', 'epoch': '0.3748', 'num_input_tokens_seen': 30467548, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.007', 'grad_norm': '2.671', 'learning_rate': '4.961e-05', 'epoch': '0.3748', 'num_input_tokens_seen': 30469595, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6836', 'grad_norm': '1.612', 'learning_rate': '4.961e-05', 'epoch': '0.3748', 'num_input_tokens_seen': 30471642, 'train_runtime': '1.542e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5361', 'grad_norm': '1.328', 'learning_rate': '4.961e-05', 'epoch': '0.3748', 'num_input_tokens_seen': 30473689, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2597', 'grad_norm': '0.8403', 'learning_rate': '4.961e-05', 'epoch': '0.3749', 'num_input_tokens_seen': 30475736, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2853', 'grad_norm': '0.7514', 'learning_rate': '4.961e-05', 'epoch': '0.3749', 'num_input_tokens_seen': 30477783, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2956', 'grad_norm': '0.8779', 'learning_rate': '4.961e-05', 'epoch': '0.3749', 'num_input_tokens_seen': 30479830, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9331', 'grad_norm': '1.748', 'learning_rate': '4.961e-05', 'epoch': '0.3749', 'num_input_tokens_seen': 30481877, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.34', 'grad_norm': '2.338', 'learning_rate': '4.961e-05', 'epoch': '0.375', 'num_input_tokens_seen': 30483924, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4389', 'grad_norm': '1.167', 'learning_rate': '4.961e-05', 'epoch': '0.375', 'num_input_tokens_seen': 30485971, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.165', 'grad_norm': '3.329', 'learning_rate': '4.961e-05', 'epoch': '0.375', 'num_input_tokens_seen': 30488018, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3838', 'grad_norm': '1.139', 'learning_rate': '4.961e-05', 'epoch': '0.375', 'num_input_tokens_seen': 30490065, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4368', 'grad_norm': '1.215', 'learning_rate': '4.961e-05', 'epoch': '0.3751', 'num_input_tokens_seen': 30492112, 'train_runtime': '1.543e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4887', 'grad_norm': '1.191', 'learning_rate': '4.961e-05', 'epoch': '0.3751', 'num_input_tokens_seen': 30494159, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9238', 'grad_norm': '1.411', 'learning_rate': '4.961e-05', 'epoch': '0.3751', 'num_input_tokens_seen': 30496206, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4066', 'grad_norm': '0.9224', 'learning_rate': '4.961e-05', 'epoch': '0.3751', 'num_input_tokens_seen': 30498253, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2999', 'grad_norm': '0.9494', 'learning_rate': '4.961e-05', 'epoch': '0.3752', 'num_input_tokens_seen': 30500300, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.1345', 'grad_norm': '0.7112', 'learning_rate': '4.961e-05', 'epoch': '0.3752', 'num_input_tokens_seen': 30502347, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.632', 'grad_norm': '2.925', 'learning_rate': '4.961e-05', 'epoch': '0.3752', 'num_input_tokens_seen': 30504394, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6003', 'grad_norm': '1.229', 'learning_rate': '4.961e-05', 'epoch': '0.3752', 'num_input_tokens_seen': 30506441, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4727', 'grad_norm': '1.333', 'learning_rate': '4.961e-05', 'epoch': '0.3753', 'num_input_tokens_seen': 30508488, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2141', 'grad_norm': '0.7714', 'learning_rate': '4.961e-05', 'epoch': '0.3753', 'num_input_tokens_seen': 30510535, 'train_runtime': '1.544e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4136', 'grad_norm': '1.148', 'learning_rate': '4.961e-05', 'epoch': '0.3753', 'num_input_tokens_seen': 30512582, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4802', 'grad_norm': '1.36', 'learning_rate': '4.961e-05', 'epoch': '0.3753', 'num_input_tokens_seen': 30514629, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.623', 'grad_norm': '2.453', 'learning_rate': '4.961e-05', 'epoch': '0.3754', 'num_input_tokens_seen': 30516676, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3544', 'grad_norm': '0.8684', 'learning_rate': '4.961e-05', 'epoch': '0.3754', 'num_input_tokens_seen': 30518723, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3198', 'grad_norm': '0.8853', 'learning_rate': '4.961e-05', 'epoch': '0.3754', 'num_input_tokens_seen': 30520770, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.217', 'grad_norm': '0.7681', 'learning_rate': '4.961e-05', 'epoch': '0.3754', 'num_input_tokens_seen': 30522817, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4533', 'grad_norm': '0.9747', 'learning_rate': '4.961e-05', 'epoch': '0.3755', 'num_input_tokens_seen': 30524864, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.93', 'grad_norm': '3.065', 'learning_rate': '4.961e-05', 'epoch': '0.3755', 'num_input_tokens_seen': 30526911, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7474', 'grad_norm': '1.362', 'learning_rate': '4.961e-05', 'epoch': '0.3755', 'num_input_tokens_seen': 30528958, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7829', 'grad_norm': '2.099', 'learning_rate': '4.961e-05', 'epoch': '0.3755', 'num_input_tokens_seen': 30531005, 'train_runtime': '1.545e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3356', 'grad_norm': '1.158', 'learning_rate': '4.961e-05', 'epoch': '0.3756', 'num_input_tokens_seen': 30533052, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8149', 'grad_norm': '1.465', 'learning_rate': '4.961e-05', 'epoch': '0.3756', 'num_input_tokens_seen': 30535099, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.327', 'grad_norm': '0.8546', 'learning_rate': '4.961e-05', 'epoch': '0.3756', 'num_input_tokens_seen': 30537146, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5612', 'grad_norm': '1.053', 'learning_rate': '4.961e-05', 'epoch': '0.3756', 'num_input_tokens_seen': 30539193, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3422', 'grad_norm': '0.962', 'learning_rate': '4.961e-05', 'epoch': '0.3757', 'num_input_tokens_seen': 30541240, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8893', 'grad_norm': '2.038', 'learning_rate': '4.961e-05', 'epoch': '0.3757', 'num_input_tokens_seen': 30543287, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2804', 'grad_norm': '0.8579', 'learning_rate': '4.961e-05', 'epoch': '0.3757', 'num_input_tokens_seen': 30545334, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3771', 'grad_norm': '0.9484', 'learning_rate': '4.961e-05', 'epoch': '0.3757', 'num_input_tokens_seen': 30547381, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8875', 'grad_norm': '1.385', 'learning_rate': '4.961e-05', 'epoch': '0.3758', 'num_input_tokens_seen': 30549428, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4096', 'grad_norm': '1.323', 'learning_rate': '4.961e-05', 'epoch': '0.3758', 'num_input_tokens_seen': 30551475, 'train_runtime': '1.546e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2723', 'grad_norm': '0.9128', 'learning_rate': '4.961e-05', 'epoch': '0.3758', 'num_input_tokens_seen': 30553522, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3084', 'grad_norm': '0.8918', 'learning_rate': '4.961e-05', 'epoch': '0.3758', 'num_input_tokens_seen': 30555569, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.73', 'grad_norm': '2.449', 'learning_rate': '4.961e-05', 'epoch': '0.3759', 'num_input_tokens_seen': 30557616, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.116', 'grad_norm': '1.163', 'learning_rate': '4.961e-05', 'epoch': '0.3759', 'num_input_tokens_seen': 30559663, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.165', 'grad_norm': '2.172', 'learning_rate': '4.961e-05', 'epoch': '0.3759', 'num_input_tokens_seen': 30561710, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3616', 'grad_norm': '0.8254', 'learning_rate': '4.961e-05', 'epoch': '0.3759', 'num_input_tokens_seen': 30563757, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.17', 'grad_norm': '1.862', 'learning_rate': '4.961e-05', 'epoch': '0.376', 'num_input_tokens_seen': 30565804, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.302', 'grad_norm': '2.14', 'learning_rate': '4.961e-05', 'epoch': '0.376', 'num_input_tokens_seen': 30567851, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2014', 'grad_norm': '0.7148', 'learning_rate': '4.961e-05', 'epoch': '0.376', 'num_input_tokens_seen': 30569898, 'train_runtime': '1.547e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9845', 'grad_norm': '1.428', 'learning_rate': '4.961e-05', 'epoch': '0.376', 'num_input_tokens_seen': 30571945, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.8247', 'grad_norm': '1.522', 'learning_rate': '4.961e-05', 'epoch': '0.3761', 'num_input_tokens_seen': 30573992, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.46', 'grad_norm': '2.823', 'learning_rate': '4.961e-05', 'epoch': '0.3761', 'num_input_tokens_seen': 30576039, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2419', 'grad_norm': '0.7978', 'learning_rate': '4.961e-05', 'epoch': '0.3761', 'num_input_tokens_seen': 30578086, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.19', 'grad_norm': '3.466', 'learning_rate': '4.961e-05', 'epoch': '0.3761', 'num_input_tokens_seen': 30580133, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2854', 'grad_norm': '0.8605', 'learning_rate': '4.961e-05', 'epoch': '0.3762', 'num_input_tokens_seen': 30582180, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.61', 'grad_norm': '1.351', 'learning_rate': '4.961e-05', 'epoch': '0.3762', 'num_input_tokens_seen': 30584227, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5551', 'grad_norm': '1.331', 'learning_rate': '4.961e-05', 'epoch': '0.3762', 'num_input_tokens_seen': 30586274, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7819', 'grad_norm': '1.246', 'learning_rate': '4.961e-05', 'epoch': '0.3762', 'num_input_tokens_seen': 30588321, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5389', 'grad_norm': '1.246', 'learning_rate': '4.961e-05', 'epoch': '0.3763', 'num_input_tokens_seen': 30590368, 'train_runtime': '1.548e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4608', 'grad_norm': '1.206', 'learning_rate': '4.961e-05', 'epoch': '0.3763', 'num_input_tokens_seen': 30592415, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.345', 'grad_norm': '0.7143', 'learning_rate': '4.961e-05', 'epoch': '0.3763', 'num_input_tokens_seen': 30594462, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6685', 'grad_norm': '1.188', 'learning_rate': '4.961e-05', 'epoch': '0.3763', 'num_input_tokens_seen': 30596509, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4574', 'grad_norm': '1.269', 'learning_rate': '4.961e-05', 'epoch': '0.3764', 'num_input_tokens_seen': 30598556, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3264', 'grad_norm': '0.8738', 'learning_rate': '4.961e-05', 'epoch': '0.3764', 'num_input_tokens_seen': 30600603, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6874', 'grad_norm': '1.322', 'learning_rate': '4.961e-05', 'epoch': '0.3764', 'num_input_tokens_seen': 30602650, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6125', 'grad_norm': '1.247', 'learning_rate': '4.961e-05', 'epoch': '0.3764', 'num_input_tokens_seen': 30604697, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6992', 'grad_norm': '1.274', 'learning_rate': '4.961e-05', 'epoch': '0.3765', 'num_input_tokens_seen': 30606744, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5253', 'grad_norm': '1.168', 'learning_rate': '4.961e-05', 'epoch': '0.3765', 'num_input_tokens_seen': 30608791, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6611', 'grad_norm': '1.287', 'learning_rate': '4.961e-05', 'epoch': '0.3765', 'num_input_tokens_seen': 30610838, 'train_runtime': '1.549e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.099', 'grad_norm': '1.928', 'learning_rate': '4.961e-05', 'epoch': '0.3765', 'num_input_tokens_seen': 30612885, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7159', 'grad_norm': '1.607', 'learning_rate': '4.961e-05', 'epoch': '0.3766', 'num_input_tokens_seen': 30614932, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.241', 'grad_norm': '0.7773', 'learning_rate': '4.961e-05', 'epoch': '0.3766', 'num_input_tokens_seen': 30616979, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5726', 'grad_norm': '1.181', 'learning_rate': '4.961e-05', 'epoch': '0.3766', 'num_input_tokens_seen': 30619026, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7336', 'grad_norm': '1.234', 'learning_rate': '4.961e-05', 'epoch': '0.3766', 'num_input_tokens_seen': 30621073, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4313', 'grad_norm': '1.034', 'learning_rate': '4.961e-05', 'epoch': '0.3767', 'num_input_tokens_seen': 30623120, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.146', 'grad_norm': '1.96', 'learning_rate': '4.961e-05', 'epoch': '0.3767', 'num_input_tokens_seen': 30625167, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4334', 'grad_norm': '1.001', 'learning_rate': '4.961e-05', 'epoch': '0.3767', 'num_input_tokens_seen': 30627214, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4364', 'grad_norm': '1.164', 'learning_rate': '4.961e-05', 'epoch': '0.3767', 'num_input_tokens_seen': 30629261, 'train_runtime': '1.55e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9853', 'grad_norm': '1.629', 'learning_rate': '4.961e-05', 'epoch': '0.3768', 'num_input_tokens_seen': 30631308, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2093', 'grad_norm': '0.851', 'learning_rate': '4.961e-05', 'epoch': '0.3768', 'num_input_tokens_seen': 30633355, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3305', 'grad_norm': '0.8369', 'learning_rate': '4.961e-05', 'epoch': '0.3768', 'num_input_tokens_seen': 30635402, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4059', 'grad_norm': '1.169', 'learning_rate': '4.961e-05', 'epoch': '0.3768', 'num_input_tokens_seen': 30637449, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.143', 'grad_norm': '2.249', 'learning_rate': '4.961e-05', 'epoch': '0.3769', 'num_input_tokens_seen': 30639496, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3534', 'grad_norm': '1.009', 'learning_rate': '4.961e-05', 'epoch': '0.3769', 'num_input_tokens_seen': 30641543, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5612', 'grad_norm': '1.515', 'learning_rate': '4.961e-05', 'epoch': '0.3769', 'num_input_tokens_seen': 30643590, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.938', 'grad_norm': '1.888', 'learning_rate': '4.961e-05', 'epoch': '0.3769', 'num_input_tokens_seen': 30645637, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7766', 'grad_norm': '2.105', 'learning_rate': '4.961e-05', 'epoch': '0.377', 'num_input_tokens_seen': 30647684, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9696', 'grad_norm': '1.34', 'learning_rate': '4.961e-05', 'epoch': '0.377', 'num_input_tokens_seen': 30649731, 'train_runtime': '1.551e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7807', 'grad_norm': '1.114', 'learning_rate': '4.961e-05', 'epoch': '0.377', 'num_input_tokens_seen': 30651778, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4967', 'grad_norm': '1.189', 'learning_rate': '4.961e-05', 'epoch': '0.377', 'num_input_tokens_seen': 30653825, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.513', 'grad_norm': '2.48', 'learning_rate': '4.961e-05', 'epoch': '0.3771', 'num_input_tokens_seen': 30655872, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4244', 'grad_norm': '0.9701', 'learning_rate': '4.961e-05', 'epoch': '0.3771', 'num_input_tokens_seen': 30657919, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.9252', 'grad_norm': '1.956', 'learning_rate': '4.961e-05', 'epoch': '0.3771', 'num_input_tokens_seen': 30659966, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2618', 'grad_norm': '0.8562', 'learning_rate': '4.961e-05', 'epoch': '0.3771', 'num_input_tokens_seen': 30662013, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2562', 'grad_norm': '0.816', 'learning_rate': '4.961e-05', 'epoch': '0.3772', 'num_input_tokens_seen': 30664060, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.2808', 'grad_norm': '0.953', 'learning_rate': '4.961e-05', 'epoch': '0.3772', 'num_input_tokens_seen': 30666107, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6541', 'grad_norm': '1.293', 'learning_rate': '4.961e-05', 'epoch': '0.3772', 'num_input_tokens_seen': 30668154, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.4546', 'grad_norm': '0.935', 'learning_rate': '4.961e-05', 'epoch': '0.3772', 'num_input_tokens_seen': 30670201, 'train_runtime': '1.552e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.7454', 'grad_norm': '1.124', 'learning_rate': '4.961e-05', 'epoch': '0.3773', 'num_input_tokens_seen': 30672248, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '2.011', 'grad_norm': '2.32', 'learning_rate': '4.961e-05', 'epoch': '0.3773', 'num_input_tokens_seen': 30674295, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3992', 'grad_norm': '0.7933', 'learning_rate': '4.961e-05', 'epoch': '0.3773', 'num_input_tokens_seen': 30676342, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.248', 'grad_norm': '0.808', 'learning_rate': '4.961e-05', 'epoch': '0.3773', 'num_input_tokens_seen': 30678389, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6797', 'grad_norm': '1.519', 'learning_rate': '4.961e-05', 'epoch': '0.3774', 'num_input_tokens_seen': 30680436, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3598', 'grad_norm': '0.8964', 'learning_rate': '4.961e-05', 'epoch': '0.3774', 'num_input_tokens_seen': 30682483, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.424', 'grad_norm': '1.79', 'learning_rate': '4.961e-05', 'epoch': '0.3774', 'num_input_tokens_seen': 30684530, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.824', 'grad_norm': '1.303', 'learning_rate': '4.961e-05', 'epoch': '0.3774', 'num_input_tokens_seen': 30686577, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6146', 'grad_norm': '1.51', 'learning_rate': '4.961e-05', 'epoch': '0.3775', 'num_input_tokens_seen': 30688624, 'train_runtime': '1.553e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.256', 'grad_norm': '1.83', 'learning_rate': '4.961e-05', 'epoch': '0.3775', 'num_input_tokens_seen': 30690671, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.128', 'grad_norm': '2.099', 'learning_rate': '4.961e-05', 'epoch': '0.3775', 'num_input_tokens_seen': 30692718, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.5614', 'grad_norm': '1.596', 'learning_rate': '4.961e-05', 'epoch': '0.3775', 'num_input_tokens_seen': 30694765, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.176', 'grad_norm': '1.62', 'learning_rate': '4.961e-05', 'epoch': '0.3776', 'num_input_tokens_seen': 30696812, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.028', 'grad_norm': '1.922', 'learning_rate': '4.961e-05', 'epoch': '0.3776', 'num_input_tokens_seen': 30698859, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.6906', 'grad_norm': '1.384', 'learning_rate': '4.961e-05', 'epoch': '0.3776', 'num_input_tokens_seen': 30700906, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '1.378', 'grad_norm': '2.214', 'learning_rate': '4.961e-05', 'epoch': '0.3776', 'num_input_tokens_seen': 30702953, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +{'loss': '0.3169', 'grad_norm': '0.948', 'learning_rate': '4.96e-05', 'epoch': '0.3777', 'num_input_tokens_seen': 30705000, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1976'} +[INFO|configuration_utils.py:665] 2026-02-05 06:56:28,080 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json +[INFO|configuration_utils.py:739] 2026-02-05 06:56:28,080 >> Model config Qwen3Config { + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151643, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 32768, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": false, + "transformers_version": "5.0.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:3327] 2026-02-05 06:56:28,575 >> chat template saved in /workspace/v127rc_exp1/D_mul/checkpoint-15000/chat_template.jinja +[INFO|tokenization_utils_base.py:2181] 2026-02-05 06:56:28,584 >> tokenizer config file saved in /workspace/v127rc_exp1/D_mul/checkpoint-15000/tokenizer_config.json + +{'loss': '0.6014', 'grad_norm': '1.253', 'learning_rate': '4.96e-05', 'epoch': '0.3777', 'num_input_tokens_seen': 30707047, 'train_runtime': '1.554e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.252', 'grad_norm': '2.218', 'learning_rate': '4.96e-05', 'epoch': '0.3777', 'num_input_tokens_seen': 30709094, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6425', 'grad_norm': '1.268', 'learning_rate': '4.96e-05', 'epoch': '0.3777', 'num_input_tokens_seen': 30711141, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5893', 'grad_norm': '1.413', 'learning_rate': '4.96e-05', 'epoch': '0.3778', 'num_input_tokens_seen': 30713188, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7906', 'grad_norm': '1.804', 'learning_rate': '4.96e-05', 'epoch': '0.3778', 'num_input_tokens_seen': 30715235, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8995', 'grad_norm': '1.974', 'learning_rate': '4.96e-05', 'epoch': '0.3778', 'num_input_tokens_seen': 30717282, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5427', 'grad_norm': '1.306', 'learning_rate': '4.96e-05', 'epoch': '0.3778', 'num_input_tokens_seen': 30719329, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6448', 'grad_norm': '1.126', 'learning_rate': '4.96e-05', 'epoch': '0.3779', 'num_input_tokens_seen': 30721376, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5258', 'grad_norm': '1.072', 'learning_rate': '4.96e-05', 'epoch': '0.3779', 'num_input_tokens_seen': 30723423, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.67', 'grad_norm': '1.472', 'learning_rate': '4.96e-05', 'epoch': '0.3779', 'num_input_tokens_seen': 30725470, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6175', 'grad_norm': '1.797', 'learning_rate': '4.96e-05', 'epoch': '0.3779', 'num_input_tokens_seen': 30727517, 'train_runtime': '1.555e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1421', 'grad_norm': '0.756', 'learning_rate': '4.96e-05', 'epoch': '0.378', 'num_input_tokens_seen': 30729564, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7582', 'grad_norm': '1.25', 'learning_rate': '4.96e-05', 'epoch': '0.378', 'num_input_tokens_seen': 30731611, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.077', 'grad_norm': '1.661', 'learning_rate': '4.96e-05', 'epoch': '0.378', 'num_input_tokens_seen': 30733658, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.873', 'grad_norm': '3.132', 'learning_rate': '4.96e-05', 'epoch': '0.378', 'num_input_tokens_seen': 30735705, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8306', 'grad_norm': '1.414', 'learning_rate': '4.96e-05', 'epoch': '0.3781', 'num_input_tokens_seen': 30737752, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.473', 'grad_norm': '1.434', 'learning_rate': '4.96e-05', 'epoch': '0.3781', 'num_input_tokens_seen': 30739799, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9867', 'grad_norm': '1.865', 'learning_rate': '4.96e-05', 'epoch': '0.3781', 'num_input_tokens_seen': 30741846, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.212', 'grad_norm': '1.873', 'learning_rate': '4.96e-05', 'epoch': '0.3782', 'num_input_tokens_seen': 30743893, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8801', 'grad_norm': '1.91', 'learning_rate': '4.96e-05', 'epoch': '0.3782', 'num_input_tokens_seen': 30745940, 'train_runtime': '1.556e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3967', 'grad_norm': '0.9345', 'learning_rate': '4.96e-05', 'epoch': '0.3782', 'num_input_tokens_seen': 30747987, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4362', 'grad_norm': '1.209', 'learning_rate': '4.96e-05', 'epoch': '0.3782', 'num_input_tokens_seen': 30750034, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.948', 'grad_norm': '1.413', 'learning_rate': '4.96e-05', 'epoch': '0.3783', 'num_input_tokens_seen': 30752081, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5567', 'grad_norm': '1.007', 'learning_rate': '4.96e-05', 'epoch': '0.3783', 'num_input_tokens_seen': 30754128, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2631', 'grad_norm': '1.01', 'learning_rate': '4.96e-05', 'epoch': '0.3783', 'num_input_tokens_seen': 30756175, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1328', 'grad_norm': '0.6596', 'learning_rate': '4.96e-05', 'epoch': '0.3783', 'num_input_tokens_seen': 30758222, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3257', 'grad_norm': '0.9319', 'learning_rate': '4.96e-05', 'epoch': '0.3784', 'num_input_tokens_seen': 30760269, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2782', 'grad_norm': '0.8754', 'learning_rate': '4.96e-05', 'epoch': '0.3784', 'num_input_tokens_seen': 30762316, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7937', 'grad_norm': '1.761', 'learning_rate': '4.96e-05', 'epoch': '0.3784', 'num_input_tokens_seen': 30764363, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2108', 'grad_norm': '0.8633', 'learning_rate': '4.96e-05', 'epoch': '0.3784', 'num_input_tokens_seen': 30766410, 'train_runtime': '1.557e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3533', 'grad_norm': '0.9721', 'learning_rate': '4.96e-05', 'epoch': '0.3785', 'num_input_tokens_seen': 30768457, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4492', 'grad_norm': '2.817', 'learning_rate': '4.96e-05', 'epoch': '0.3785', 'num_input_tokens_seen': 30770504, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.362', 'grad_norm': '2.709', 'learning_rate': '4.96e-05', 'epoch': '0.3785', 'num_input_tokens_seen': 30772551, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3055', 'grad_norm': '0.9565', 'learning_rate': '4.96e-05', 'epoch': '0.3785', 'num_input_tokens_seen': 30774598, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3519', 'grad_norm': '1.01', 'learning_rate': '4.96e-05', 'epoch': '0.3786', 'num_input_tokens_seen': 30776645, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.428', 'grad_norm': '2.574', 'learning_rate': '4.96e-05', 'epoch': '0.3786', 'num_input_tokens_seen': 30778692, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.382', 'grad_norm': '2.218', 'learning_rate': '4.96e-05', 'epoch': '0.3786', 'num_input_tokens_seen': 30780739, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2067', 'grad_norm': '0.803', 'learning_rate': '4.96e-05', 'epoch': '0.3786', 'num_input_tokens_seen': 30782786, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7707', 'grad_norm': '1.296', 'learning_rate': '4.96e-05', 'epoch': '0.3787', 'num_input_tokens_seen': 30784833, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.913', 'grad_norm': '1.746', 'learning_rate': '4.96e-05', 'epoch': '0.3787', 'num_input_tokens_seen': 30786880, 'train_runtime': '1.558e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3035', 'grad_norm': '1.103', 'learning_rate': '4.96e-05', 'epoch': '0.3787', 'num_input_tokens_seen': 30788927, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5262', 'grad_norm': '1.438', 'learning_rate': '4.96e-05', 'epoch': '0.3787', 'num_input_tokens_seen': 30790974, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2641', 'grad_norm': '0.7844', 'learning_rate': '4.96e-05', 'epoch': '0.3788', 'num_input_tokens_seen': 30793021, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1879', 'grad_norm': '0.7808', 'learning_rate': '4.96e-05', 'epoch': '0.3788', 'num_input_tokens_seen': 30795068, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3723', 'grad_norm': '0.9885', 'learning_rate': '4.96e-05', 'epoch': '0.3788', 'num_input_tokens_seen': 30797115, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3177', 'grad_norm': '0.8537', 'learning_rate': '4.96e-05', 'epoch': '0.3788', 'num_input_tokens_seen': 30799162, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3058', 'grad_norm': '0.9243', 'learning_rate': '4.96e-05', 'epoch': '0.3789', 'num_input_tokens_seen': 30801209, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9334', 'grad_norm': '2.221', 'learning_rate': '4.96e-05', 'epoch': '0.3789', 'num_input_tokens_seen': 30803256, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.535', 'grad_norm': '2.036', 'learning_rate': '4.96e-05', 'epoch': '0.3789', 'num_input_tokens_seen': 30805303, 'train_runtime': '1.559e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7907', 'grad_norm': '1.372', 'learning_rate': '4.96e-05', 'epoch': '0.3789', 'num_input_tokens_seen': 30807350, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9158', 'grad_norm': '2.134', 'learning_rate': '4.96e-05', 'epoch': '0.379', 'num_input_tokens_seen': 30809397, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5719', 'grad_norm': '1.327', 'learning_rate': '4.96e-05', 'epoch': '0.379', 'num_input_tokens_seen': 30811444, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2479', 'grad_norm': '0.8268', 'learning_rate': '4.96e-05', 'epoch': '0.379', 'num_input_tokens_seen': 30813491, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.257', 'grad_norm': '2.441', 'learning_rate': '4.96e-05', 'epoch': '0.379', 'num_input_tokens_seen': 30815538, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7359', 'grad_norm': '2.236', 'learning_rate': '4.96e-05', 'epoch': '0.3791', 'num_input_tokens_seen': 30817585, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2032', 'grad_norm': '0.7446', 'learning_rate': '4.96e-05', 'epoch': '0.3791', 'num_input_tokens_seen': 30819632, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2724', 'grad_norm': '0.7761', 'learning_rate': '4.96e-05', 'epoch': '0.3791', 'num_input_tokens_seen': 30821679, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8807', 'grad_norm': '1.617', 'learning_rate': '4.96e-05', 'epoch': '0.3791', 'num_input_tokens_seen': 30823726, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6682', 'grad_norm': '1.151', 'learning_rate': '4.96e-05', 'epoch': '0.3792', 'num_input_tokens_seen': 30825773, 'train_runtime': '1.56e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8242', 'grad_norm': '1.5', 'learning_rate': '4.96e-05', 'epoch': '0.3792', 'num_input_tokens_seen': 30827820, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2017', 'grad_norm': '0.7846', 'learning_rate': '4.96e-05', 'epoch': '0.3792', 'num_input_tokens_seen': 30829867, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5261', 'grad_norm': '1.189', 'learning_rate': '4.96e-05', 'epoch': '0.3792', 'num_input_tokens_seen': 30831914, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.858', 'grad_norm': '1.576', 'learning_rate': '4.96e-05', 'epoch': '0.3793', 'num_input_tokens_seen': 30833961, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8092', 'grad_norm': '1.637', 'learning_rate': '4.96e-05', 'epoch': '0.3793', 'num_input_tokens_seen': 30836008, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.228', 'grad_norm': '3.598', 'learning_rate': '4.96e-05', 'epoch': '0.3793', 'num_input_tokens_seen': 30838055, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6077', 'grad_norm': '1.279', 'learning_rate': '4.96e-05', 'epoch': '0.3793', 'num_input_tokens_seen': 30840102, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2101', 'grad_norm': '0.7632', 'learning_rate': '4.96e-05', 'epoch': '0.3794', 'num_input_tokens_seen': 30842149, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.309', 'grad_norm': '2.399', 'learning_rate': '4.96e-05', 'epoch': '0.3794', 'num_input_tokens_seen': 30844196, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6267', 'grad_norm': '1.911', 'learning_rate': '4.96e-05', 'epoch': '0.3794', 'num_input_tokens_seen': 30846243, 'train_runtime': '1.561e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.018', 'grad_norm': '1.447', 'learning_rate': '4.96e-05', 'epoch': '0.3794', 'num_input_tokens_seen': 30848290, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5448', 'grad_norm': '0.8823', 'learning_rate': '4.96e-05', 'epoch': '0.3795', 'num_input_tokens_seen': 30850337, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.44', 'grad_norm': '2.063', 'learning_rate': '4.96e-05', 'epoch': '0.3795', 'num_input_tokens_seen': 30852384, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3346', 'grad_norm': '0.9312', 'learning_rate': '4.96e-05', 'epoch': '0.3795', 'num_input_tokens_seen': 30854431, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6844', 'grad_norm': '1.362', 'learning_rate': '4.96e-05', 'epoch': '0.3795', 'num_input_tokens_seen': 30856478, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3946', 'grad_norm': '0.9942', 'learning_rate': '4.96e-05', 'epoch': '0.3796', 'num_input_tokens_seen': 30858525, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.217', 'grad_norm': '2.275', 'learning_rate': '4.96e-05', 'epoch': '0.3796', 'num_input_tokens_seen': 30860572, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1945', 'grad_norm': '0.8371', 'learning_rate': '4.96e-05', 'epoch': '0.3796', 'num_input_tokens_seen': 30862619, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3857', 'grad_norm': '1.194', 'learning_rate': '4.96e-05', 'epoch': '0.3796', 'num_input_tokens_seen': 30864666, 'train_runtime': '1.562e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2507', 'grad_norm': '0.852', 'learning_rate': '4.96e-05', 'epoch': '0.3797', 'num_input_tokens_seen': 30866713, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8428', 'grad_norm': '1.298', 'learning_rate': '4.96e-05', 'epoch': '0.3797', 'num_input_tokens_seen': 30868760, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.607', 'grad_norm': '0.9621', 'learning_rate': '4.96e-05', 'epoch': '0.3797', 'num_input_tokens_seen': 30870807, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5439', 'grad_norm': '1.443', 'learning_rate': '4.96e-05', 'epoch': '0.3797', 'num_input_tokens_seen': 30872854, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8639', 'grad_norm': '1.973', 'learning_rate': '4.96e-05', 'epoch': '0.3798', 'num_input_tokens_seen': 30874901, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4625', 'grad_norm': '0.9043', 'learning_rate': '4.96e-05', 'epoch': '0.3798', 'num_input_tokens_seen': 30876948, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2284', 'grad_norm': '0.8446', 'learning_rate': '4.96e-05', 'epoch': '0.3798', 'num_input_tokens_seen': 30878995, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3124', 'grad_norm': '0.8203', 'learning_rate': '4.96e-05', 'epoch': '0.3798', 'num_input_tokens_seen': 30881042, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7149', 'grad_norm': '1.043', 'learning_rate': '4.96e-05', 'epoch': '0.3799', 'num_input_tokens_seen': 30883089, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7918', 'grad_norm': '1.614', 'learning_rate': '4.96e-05', 'epoch': '0.3799', 'num_input_tokens_seen': 30885136, 'train_runtime': '1.563e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3332', 'grad_norm': '0.81', 'learning_rate': '4.96e-05', 'epoch': '0.3799', 'num_input_tokens_seen': 30887183, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7888', 'grad_norm': '1.266', 'learning_rate': '4.96e-05', 'epoch': '0.3799', 'num_input_tokens_seen': 30889230, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8226', 'grad_norm': '1.731', 'learning_rate': '4.96e-05', 'epoch': '0.38', 'num_input_tokens_seen': 30891277, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6733', 'grad_norm': '1.416', 'learning_rate': '4.96e-05', 'epoch': '0.38', 'num_input_tokens_seen': 30893324, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.543', 'grad_norm': '2.317', 'learning_rate': '4.96e-05', 'epoch': '0.38', 'num_input_tokens_seen': 30895371, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6353', 'grad_norm': '1.393', 'learning_rate': '4.96e-05', 'epoch': '0.38', 'num_input_tokens_seen': 30897418, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2684', 'grad_norm': '0.8969', 'learning_rate': '4.96e-05', 'epoch': '0.3801', 'num_input_tokens_seen': 30899465, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9283', 'grad_norm': '1.257', 'learning_rate': '4.96e-05', 'epoch': '0.3801', 'num_input_tokens_seen': 30901512, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7045', 'grad_norm': '1.524', 'learning_rate': '4.96e-05', 'epoch': '0.3801', 'num_input_tokens_seen': 30903559, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1446', 'grad_norm': '0.7879', 'learning_rate': '4.96e-05', 'epoch': '0.3801', 'num_input_tokens_seen': 30905606, 'train_runtime': '1.564e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2668', 'grad_norm': '0.965', 'learning_rate': '4.96e-05', 'epoch': '0.3802', 'num_input_tokens_seen': 30907653, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8279', 'grad_norm': '2.237', 'learning_rate': '4.96e-05', 'epoch': '0.3802', 'num_input_tokens_seen': 30909700, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.023', 'grad_norm': '1.373', 'learning_rate': '4.96e-05', 'epoch': '0.3802', 'num_input_tokens_seen': 30911747, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2782', 'grad_norm': '0.9701', 'learning_rate': '4.96e-05', 'epoch': '0.3802', 'num_input_tokens_seen': 30913794, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.223', 'grad_norm': '1.493', 'learning_rate': '4.96e-05', 'epoch': '0.3803', 'num_input_tokens_seen': 30915841, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6743', 'grad_norm': '1.374', 'learning_rate': '4.96e-05', 'epoch': '0.3803', 'num_input_tokens_seen': 30917888, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4147', 'grad_norm': '1.064', 'learning_rate': '4.96e-05', 'epoch': '0.3803', 'num_input_tokens_seen': 30919935, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.162', 'grad_norm': '1.845', 'learning_rate': '4.96e-05', 'epoch': '0.3803', 'num_input_tokens_seen': 30921982, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3443', 'grad_norm': '1.141', 'learning_rate': '4.96e-05', 'epoch': '0.3804', 'num_input_tokens_seen': 30924029, 'train_runtime': '1.565e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.727', 'grad_norm': '2.376', 'learning_rate': '4.96e-05', 'epoch': '0.3804', 'num_input_tokens_seen': 30926076, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3396', 'grad_norm': '0.8972', 'learning_rate': '4.96e-05', 'epoch': '0.3804', 'num_input_tokens_seen': 30928123, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5609', 'grad_norm': '1.02', 'learning_rate': '4.96e-05', 'epoch': '0.3804', 'num_input_tokens_seen': 30930170, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2994', 'grad_norm': '0.8415', 'learning_rate': '4.96e-05', 'epoch': '0.3805', 'num_input_tokens_seen': 30932217, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.149', 'grad_norm': '1.763', 'learning_rate': '4.96e-05', 'epoch': '0.3805', 'num_input_tokens_seen': 30934264, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5275', 'grad_norm': '1.154', 'learning_rate': '4.96e-05', 'epoch': '0.3805', 'num_input_tokens_seen': 30936311, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2632', 'grad_norm': '0.7648', 'learning_rate': '4.96e-05', 'epoch': '0.3805', 'num_input_tokens_seen': 30938358, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5169', 'grad_norm': '1.13', 'learning_rate': '4.96e-05', 'epoch': '0.3806', 'num_input_tokens_seen': 30940405, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3635', 'grad_norm': '1.157', 'learning_rate': '4.96e-05', 'epoch': '0.3806', 'num_input_tokens_seen': 30942452, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3787', 'grad_norm': '0.9608', 'learning_rate': '4.96e-05', 'epoch': '0.3806', 'num_input_tokens_seen': 30944499, 'train_runtime': '1.566e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.124', 'grad_norm': '1.991', 'learning_rate': '4.96e-05', 'epoch': '0.3806', 'num_input_tokens_seen': 30946546, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3803', 'grad_norm': '1.035', 'learning_rate': '4.96e-05', 'epoch': '0.3807', 'num_input_tokens_seen': 30948593, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3777', 'grad_norm': '0.8707', 'learning_rate': '4.96e-05', 'epoch': '0.3807', 'num_input_tokens_seen': 30950640, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.942', 'grad_norm': '2.909', 'learning_rate': '4.96e-05', 'epoch': '0.3807', 'num_input_tokens_seen': 30952687, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7051', 'grad_norm': '1.442', 'learning_rate': '4.96e-05', 'epoch': '0.3807', 'num_input_tokens_seen': 30954734, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4516', 'grad_norm': '0.8849', 'learning_rate': '4.96e-05', 'epoch': '0.3808', 'num_input_tokens_seen': 30956781, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6961', 'grad_norm': '1.336', 'learning_rate': '4.96e-05', 'epoch': '0.3808', 'num_input_tokens_seen': 30958828, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3713', 'grad_norm': '1.04', 'learning_rate': '4.96e-05', 'epoch': '0.3808', 'num_input_tokens_seen': 30960875, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6966', 'grad_norm': '1.386', 'learning_rate': '4.96e-05', 'epoch': '0.3808', 'num_input_tokens_seen': 30962922, 'train_runtime': '1.567e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3167', 'grad_norm': '0.6958', 'learning_rate': '4.96e-05', 'epoch': '0.3809', 'num_input_tokens_seen': 30964969, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5186', 'grad_norm': '1.385', 'learning_rate': '4.96e-05', 'epoch': '0.3809', 'num_input_tokens_seen': 30967016, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7062', 'grad_norm': '1.521', 'learning_rate': '4.96e-05', 'epoch': '0.3809', 'num_input_tokens_seen': 30969063, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2673', 'grad_norm': '0.8347', 'learning_rate': '4.96e-05', 'epoch': '0.3809', 'num_input_tokens_seen': 30971110, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5904', 'grad_norm': '1.048', 'learning_rate': '4.96e-05', 'epoch': '0.381', 'num_input_tokens_seen': 30973157, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4874', 'grad_norm': '1.252', 'learning_rate': '4.96e-05', 'epoch': '0.381', 'num_input_tokens_seen': 30975204, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7779', 'grad_norm': '1.504', 'learning_rate': '4.96e-05', 'epoch': '0.381', 'num_input_tokens_seen': 30977251, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.17', 'grad_norm': '1.992', 'learning_rate': '4.96e-05', 'epoch': '0.381', 'num_input_tokens_seen': 30979298, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7301', 'grad_norm': '1.73', 'learning_rate': '4.96e-05', 'epoch': '0.3811', 'num_input_tokens_seen': 30981345, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6402', 'grad_norm': '1.253', 'learning_rate': '4.96e-05', 'epoch': '0.3811', 'num_input_tokens_seen': 30983392, 'train_runtime': '1.568e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8883', 'grad_norm': '1.753', 'learning_rate': '4.96e-05', 'epoch': '0.3811', 'num_input_tokens_seen': 30985439, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8605', 'grad_norm': '1.416', 'learning_rate': '4.96e-05', 'epoch': '0.3811', 'num_input_tokens_seen': 30987486, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2278', 'grad_norm': '0.8281', 'learning_rate': '4.959e-05', 'epoch': '0.3812', 'num_input_tokens_seen': 30989533, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9964', 'grad_norm': '1.826', 'learning_rate': '4.959e-05', 'epoch': '0.3812', 'num_input_tokens_seen': 30991580, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5386', 'grad_norm': '1.118', 'learning_rate': '4.959e-05', 'epoch': '0.3812', 'num_input_tokens_seen': 30993627, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9331', 'grad_norm': '1.705', 'learning_rate': '4.959e-05', 'epoch': '0.3812', 'num_input_tokens_seen': 30995674, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.298', 'grad_norm': '0.9956', 'learning_rate': '4.959e-05', 'epoch': '0.3813', 'num_input_tokens_seen': 30997721, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9175', 'grad_norm': '1.336', 'learning_rate': '4.959e-05', 'epoch': '0.3813', 'num_input_tokens_seen': 30999768, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3582', 'grad_norm': '0.9768', 'learning_rate': '4.959e-05', 'epoch': '0.3813', 'num_input_tokens_seen': 31001815, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3281', 'grad_norm': '0.8747', 'learning_rate': '4.959e-05', 'epoch': '0.3813', 'num_input_tokens_seen': 31003862, 'train_runtime': '1.569e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.241', 'grad_norm': '2.594', 'learning_rate': '4.959e-05', 'epoch': '0.3814', 'num_input_tokens_seen': 31005909, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.84', 'grad_norm': '1.343', 'learning_rate': '4.959e-05', 'epoch': '0.3814', 'num_input_tokens_seen': 31007956, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4075', 'grad_norm': '0.9433', 'learning_rate': '4.959e-05', 'epoch': '0.3814', 'num_input_tokens_seen': 31010003, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7278', 'grad_norm': '1.231', 'learning_rate': '4.959e-05', 'epoch': '0.3814', 'num_input_tokens_seen': 31012050, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6245', 'grad_norm': '1.191', 'learning_rate': '4.959e-05', 'epoch': '0.3815', 'num_input_tokens_seen': 31014097, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4697', 'grad_norm': '1.168', 'learning_rate': '4.959e-05', 'epoch': '0.3815', 'num_input_tokens_seen': 31016144, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.329', 'grad_norm': '0.9123', 'learning_rate': '4.959e-05', 'epoch': '0.3815', 'num_input_tokens_seen': 31018191, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3047', 'grad_norm': '0.9863', 'learning_rate': '4.959e-05', 'epoch': '0.3815', 'num_input_tokens_seen': 31020238, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.01', 'grad_norm': '1.699', 'learning_rate': '4.959e-05', 'epoch': '0.3816', 'num_input_tokens_seen': 31022285, 'train_runtime': '1.57e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6069', 'grad_norm': '1.166', 'learning_rate': '4.959e-05', 'epoch': '0.3816', 'num_input_tokens_seen': 31024332, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.203', 'grad_norm': '2.036', 'learning_rate': '4.959e-05', 'epoch': '0.3816', 'num_input_tokens_seen': 31026379, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7765', 'grad_norm': '1.403', 'learning_rate': '4.959e-05', 'epoch': '0.3817', 'num_input_tokens_seen': 31028426, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2194', 'grad_norm': '0.6838', 'learning_rate': '4.959e-05', 'epoch': '0.3817', 'num_input_tokens_seen': 31030473, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7034', 'grad_norm': '1.431', 'learning_rate': '4.959e-05', 'epoch': '0.3817', 'num_input_tokens_seen': 31032520, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4039', 'grad_norm': '1.289', 'learning_rate': '4.959e-05', 'epoch': '0.3817', 'num_input_tokens_seen': 31034567, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.22', 'grad_norm': '2.246', 'learning_rate': '4.959e-05', 'epoch': '0.3818', 'num_input_tokens_seen': 31036614, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6962', 'grad_norm': '1.169', 'learning_rate': '4.959e-05', 'epoch': '0.3818', 'num_input_tokens_seen': 31038661, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.573', 'grad_norm': '1.227', 'learning_rate': '4.959e-05', 'epoch': '0.3818', 'num_input_tokens_seen': 31040708, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7803', 'grad_norm': '1.527', 'learning_rate': '4.959e-05', 'epoch': '0.3818', 'num_input_tokens_seen': 31042755, 'train_runtime': '1.571e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2424', 'grad_norm': '0.8397', 'learning_rate': '4.959e-05', 'epoch': '0.3819', 'num_input_tokens_seen': 31044802, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8675', 'grad_norm': '1.934', 'learning_rate': '4.959e-05', 'epoch': '0.3819', 'num_input_tokens_seen': 31046849, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2934', 'grad_norm': '0.9977', 'learning_rate': '4.959e-05', 'epoch': '0.3819', 'num_input_tokens_seen': 31048896, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3208', 'grad_norm': '0.8417', 'learning_rate': '4.959e-05', 'epoch': '0.3819', 'num_input_tokens_seen': 31050943, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2508', 'grad_norm': '0.7977', 'learning_rate': '4.959e-05', 'epoch': '0.382', 'num_input_tokens_seen': 31052990, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2928', 'grad_norm': '0.8362', 'learning_rate': '4.959e-05', 'epoch': '0.382', 'num_input_tokens_seen': 31055037, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2393', 'grad_norm': '0.8172', 'learning_rate': '4.959e-05', 'epoch': '0.382', 'num_input_tokens_seen': 31057084, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.229', 'grad_norm': '2.608', 'learning_rate': '4.959e-05', 'epoch': '0.382', 'num_input_tokens_seen': 31059131, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9598', 'grad_norm': '2.451', 'learning_rate': '4.959e-05', 'epoch': '0.3821', 'num_input_tokens_seen': 31061178, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5416', 'grad_norm': '0.9031', 'learning_rate': '4.959e-05', 'epoch': '0.3821', 'num_input_tokens_seen': 31063225, 'train_runtime': '1.572e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7051', 'grad_norm': '1.22', 'learning_rate': '4.959e-05', 'epoch': '0.3821', 'num_input_tokens_seen': 31065272, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.337', 'grad_norm': '1.003', 'learning_rate': '4.959e-05', 'epoch': '0.3821', 'num_input_tokens_seen': 31067319, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.356', 'grad_norm': '2.244', 'learning_rate': '4.959e-05', 'epoch': '0.3822', 'num_input_tokens_seen': 31069366, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9407', 'grad_norm': '2.116', 'learning_rate': '4.959e-05', 'epoch': '0.3822', 'num_input_tokens_seen': 31071413, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.618', 'grad_norm': '2.475', 'learning_rate': '4.959e-05', 'epoch': '0.3822', 'num_input_tokens_seen': 31073460, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3671', 'grad_norm': '0.8293', 'learning_rate': '4.959e-05', 'epoch': '0.3822', 'num_input_tokens_seen': 31075507, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3013', 'grad_norm': '0.9565', 'learning_rate': '4.959e-05', 'epoch': '0.3823', 'num_input_tokens_seen': 31077554, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5011', 'grad_norm': '1.079', 'learning_rate': '4.959e-05', 'epoch': '0.3823', 'num_input_tokens_seen': 31079601, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7916', 'grad_norm': '1.577', 'learning_rate': '4.959e-05', 'epoch': '0.3823', 'num_input_tokens_seen': 31081648, 'train_runtime': '1.573e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7753', 'grad_norm': '1.125', 'learning_rate': '4.959e-05', 'epoch': '0.3823', 'num_input_tokens_seen': 31083695, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3974', 'grad_norm': '0.8592', 'learning_rate': '4.959e-05', 'epoch': '0.3824', 'num_input_tokens_seen': 31085742, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.409', 'grad_norm': '1.164', 'learning_rate': '4.959e-05', 'epoch': '0.3824', 'num_input_tokens_seen': 31087789, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7268', 'grad_norm': '0.9966', 'learning_rate': '4.959e-05', 'epoch': '0.3824', 'num_input_tokens_seen': 31089836, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3167', 'grad_norm': '0.8091', 'learning_rate': '4.959e-05', 'epoch': '0.3824', 'num_input_tokens_seen': 31091883, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2877', 'grad_norm': '0.7969', 'learning_rate': '4.959e-05', 'epoch': '0.3825', 'num_input_tokens_seen': 31093930, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.245', 'grad_norm': '0.7897', 'learning_rate': '4.959e-05', 'epoch': '0.3825', 'num_input_tokens_seen': 31095977, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6747', 'grad_norm': '1.093', 'learning_rate': '4.959e-05', 'epoch': '0.3825', 'num_input_tokens_seen': 31098024, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.436', 'grad_norm': '2.822', 'learning_rate': '4.959e-05', 'epoch': '0.3825', 'num_input_tokens_seen': 31100071, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.055', 'grad_norm': '1.944', 'learning_rate': '4.959e-05', 'epoch': '0.3826', 'num_input_tokens_seen': 31102118, 'train_runtime': '1.574e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4269', 'grad_norm': '1.244', 'learning_rate': '4.959e-05', 'epoch': '0.3826', 'num_input_tokens_seen': 31104165, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7125', 'grad_norm': '1.347', 'learning_rate': '4.959e-05', 'epoch': '0.3826', 'num_input_tokens_seen': 31106212, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7276', 'grad_norm': '1.25', 'learning_rate': '4.959e-05', 'epoch': '0.3826', 'num_input_tokens_seen': 31108259, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4848', 'grad_norm': '0.9715', 'learning_rate': '4.959e-05', 'epoch': '0.3827', 'num_input_tokens_seen': 31110306, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4295', 'grad_norm': '1.223', 'learning_rate': '4.959e-05', 'epoch': '0.3827', 'num_input_tokens_seen': 31112353, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4585', 'grad_norm': '1.116', 'learning_rate': '4.959e-05', 'epoch': '0.3827', 'num_input_tokens_seen': 31114400, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2908', 'grad_norm': '0.8627', 'learning_rate': '4.959e-05', 'epoch': '0.3827', 'num_input_tokens_seen': 31116447, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9316', 'grad_norm': '1.37', 'learning_rate': '4.959e-05', 'epoch': '0.3828', 'num_input_tokens_seen': 31118494, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4107', 'grad_norm': '1.028', 'learning_rate': '4.959e-05', 'epoch': '0.3828', 'num_input_tokens_seen': 31120541, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3637', 'grad_norm': '0.879', 'learning_rate': '4.959e-05', 'epoch': '0.3828', 'num_input_tokens_seen': 31122588, 'train_runtime': '1.575e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.611', 'grad_norm': '2.459', 'learning_rate': '4.959e-05', 'epoch': '0.3828', 'num_input_tokens_seen': 31124635, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '2.474', 'grad_norm': '2.741', 'learning_rate': '4.959e-05', 'epoch': '0.3829', 'num_input_tokens_seen': 31126682, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.328', 'grad_norm': '1.966', 'learning_rate': '4.959e-05', 'epoch': '0.3829', 'num_input_tokens_seen': 31128729, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7905', 'grad_norm': '0.942', 'learning_rate': '4.959e-05', 'epoch': '0.3829', 'num_input_tokens_seen': 31130776, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5446', 'grad_norm': '1.227', 'learning_rate': '4.959e-05', 'epoch': '0.3829', 'num_input_tokens_seen': 31132823, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7814', 'grad_norm': '1.809', 'learning_rate': '4.959e-05', 'epoch': '0.383', 'num_input_tokens_seen': 31134870, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8926', 'grad_norm': '1.543', 'learning_rate': '4.959e-05', 'epoch': '0.383', 'num_input_tokens_seen': 31136917, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3984', 'grad_norm': '1.086', 'learning_rate': '4.959e-05', 'epoch': '0.383', 'num_input_tokens_seen': 31138964, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4269', 'grad_norm': '0.9542', 'learning_rate': '4.959e-05', 'epoch': '0.383', 'num_input_tokens_seen': 31141011, 'train_runtime': '1.576e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.155', 'grad_norm': '0.857', 'learning_rate': '4.959e-05', 'epoch': '0.3831', 'num_input_tokens_seen': 31143058, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.387', 'grad_norm': '2.283', 'learning_rate': '4.959e-05', 'epoch': '0.3831', 'num_input_tokens_seen': 31145105, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4951', 'grad_norm': '1.012', 'learning_rate': '4.959e-05', 'epoch': '0.3831', 'num_input_tokens_seen': 31147152, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3097', 'grad_norm': '0.8178', 'learning_rate': '4.959e-05', 'epoch': '0.3831', 'num_input_tokens_seen': 31149199, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4527', 'grad_norm': '1.065', 'learning_rate': '4.959e-05', 'epoch': '0.3832', 'num_input_tokens_seen': 31151246, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8022', 'grad_norm': '1.51', 'learning_rate': '4.959e-05', 'epoch': '0.3832', 'num_input_tokens_seen': 31153293, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3682', 'grad_norm': '0.9628', 'learning_rate': '4.959e-05', 'epoch': '0.3832', 'num_input_tokens_seen': 31155340, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.113', 'grad_norm': '2.555', 'learning_rate': '4.959e-05', 'epoch': '0.3832', 'num_input_tokens_seen': 31157387, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.299', 'grad_norm': '2.291', 'learning_rate': '4.959e-05', 'epoch': '0.3833', 'num_input_tokens_seen': 31159434, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3273', 'grad_norm': '0.8565', 'learning_rate': '4.959e-05', 'epoch': '0.3833', 'num_input_tokens_seen': 31161481, 'train_runtime': '1.577e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7624', 'grad_norm': '1.237', 'learning_rate': '4.959e-05', 'epoch': '0.3833', 'num_input_tokens_seen': 31163528, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1603', 'grad_norm': '0.7467', 'learning_rate': '4.959e-05', 'epoch': '0.3833', 'num_input_tokens_seen': 31165575, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6759', 'grad_norm': '1.223', 'learning_rate': '4.959e-05', 'epoch': '0.3834', 'num_input_tokens_seen': 31167622, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2802', 'grad_norm': '0.9252', 'learning_rate': '4.959e-05', 'epoch': '0.3834', 'num_input_tokens_seen': 31169669, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8978', 'grad_norm': '1.799', 'learning_rate': '4.959e-05', 'epoch': '0.3834', 'num_input_tokens_seen': 31171716, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.219', 'grad_norm': '1.813', 'learning_rate': '4.959e-05', 'epoch': '0.3834', 'num_input_tokens_seen': 31173763, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2794', 'grad_norm': '0.9098', 'learning_rate': '4.959e-05', 'epoch': '0.3835', 'num_input_tokens_seen': 31175810, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.748', 'grad_norm': '2.789', 'learning_rate': '4.959e-05', 'epoch': '0.3835', 'num_input_tokens_seen': 31177857, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.013', 'grad_norm': '1.891', 'learning_rate': '4.959e-05', 'epoch': '0.3835', 'num_input_tokens_seen': 31179904, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3659', 'grad_norm': '0.7996', 'learning_rate': '4.959e-05', 'epoch': '0.3835', 'num_input_tokens_seen': 31181951, 'train_runtime': '1.578e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.584', 'grad_norm': '2.6', 'learning_rate': '4.959e-05', 'epoch': '0.3836', 'num_input_tokens_seen': 31183998, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6809', 'grad_norm': '1.097', 'learning_rate': '4.959e-05', 'epoch': '0.3836', 'num_input_tokens_seen': 31186045, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8223', 'grad_norm': '1.118', 'learning_rate': '4.959e-05', 'epoch': '0.3836', 'num_input_tokens_seen': 31188092, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3297', 'grad_norm': '0.8978', 'learning_rate': '4.959e-05', 'epoch': '0.3836', 'num_input_tokens_seen': 31190139, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3332', 'grad_norm': '1.207', 'learning_rate': '4.959e-05', 'epoch': '0.3837', 'num_input_tokens_seen': 31192186, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.417', 'grad_norm': '1.117', 'learning_rate': '4.959e-05', 'epoch': '0.3837', 'num_input_tokens_seen': 31194233, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.051', 'grad_norm': '1.703', 'learning_rate': '4.959e-05', 'epoch': '0.3837', 'num_input_tokens_seen': 31196280, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.514', 'grad_norm': '2.328', 'learning_rate': '4.959e-05', 'epoch': '0.3837', 'num_input_tokens_seen': 31198327, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2083', 'grad_norm': '0.8309', 'learning_rate': '4.959e-05', 'epoch': '0.3838', 'num_input_tokens_seen': 31200374, 'train_runtime': '1.579e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.275', 'grad_norm': '2.531', 'learning_rate': '4.959e-05', 'epoch': '0.3838', 'num_input_tokens_seen': 31202421, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8887', 'grad_norm': '1.942', 'learning_rate': '4.959e-05', 'epoch': '0.3838', 'num_input_tokens_seen': 31204468, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4942', 'grad_norm': '1.131', 'learning_rate': '4.959e-05', 'epoch': '0.3838', 'num_input_tokens_seen': 31206515, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2878', 'grad_norm': '0.7901', 'learning_rate': '4.959e-05', 'epoch': '0.3839', 'num_input_tokens_seen': 31208562, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.713', 'grad_norm': '2.595', 'learning_rate': '4.959e-05', 'epoch': '0.3839', 'num_input_tokens_seen': 31210609, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7939', 'grad_norm': '1.274', 'learning_rate': '4.959e-05', 'epoch': '0.3839', 'num_input_tokens_seen': 31212656, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3396', 'grad_norm': '0.8457', 'learning_rate': '4.959e-05', 'epoch': '0.3839', 'num_input_tokens_seen': 31214703, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.043', 'grad_norm': '1.675', 'learning_rate': '4.959e-05', 'epoch': '0.384', 'num_input_tokens_seen': 31216750, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6291', 'grad_norm': '1.04', 'learning_rate': '4.959e-05', 'epoch': '0.384', 'num_input_tokens_seen': 31218797, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3615', 'grad_norm': '0.8616', 'learning_rate': '4.959e-05', 'epoch': '0.384', 'num_input_tokens_seen': 31220844, 'train_runtime': '1.58e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6317', 'grad_norm': '1.166', 'learning_rate': '4.959e-05', 'epoch': '0.384', 'num_input_tokens_seen': 31222891, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5688', 'grad_norm': '1.014', 'learning_rate': '4.959e-05', 'epoch': '0.3841', 'num_input_tokens_seen': 31224938, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6822', 'grad_norm': '1.17', 'learning_rate': '4.959e-05', 'epoch': '0.3841', 'num_input_tokens_seen': 31226985, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3838', 'grad_norm': '1.168', 'learning_rate': '4.959e-05', 'epoch': '0.3841', 'num_input_tokens_seen': 31229032, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5168', 'grad_norm': '1.202', 'learning_rate': '4.959e-05', 'epoch': '0.3841', 'num_input_tokens_seen': 31231079, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.063', 'grad_norm': '1.96', 'learning_rate': '4.959e-05', 'epoch': '0.3842', 'num_input_tokens_seen': 31233126, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9848', 'grad_norm': '2.28', 'learning_rate': '4.959e-05', 'epoch': '0.3842', 'num_input_tokens_seen': 31235173, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.073', 'grad_norm': '1.666', 'learning_rate': '4.959e-05', 'epoch': '0.3842', 'num_input_tokens_seen': 31237220, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5074', 'grad_norm': '1.164', 'learning_rate': '4.959e-05', 'epoch': '0.3842', 'num_input_tokens_seen': 31239267, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5022', 'grad_norm': '1.434', 'learning_rate': '4.959e-05', 'epoch': '0.3843', 'num_input_tokens_seen': 31241314, 'train_runtime': '1.581e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7253', 'grad_norm': '1.125', 'learning_rate': '4.959e-05', 'epoch': '0.3843', 'num_input_tokens_seen': 31243361, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.26', 'grad_norm': '0.9261', 'learning_rate': '4.959e-05', 'epoch': '0.3843', 'num_input_tokens_seen': 31245408, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.208', 'grad_norm': '2.367', 'learning_rate': '4.959e-05', 'epoch': '0.3843', 'num_input_tokens_seen': 31247455, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.252', 'grad_norm': '2.562', 'learning_rate': '4.959e-05', 'epoch': '0.3844', 'num_input_tokens_seen': 31249502, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.266', 'grad_norm': '0.8119', 'learning_rate': '4.959e-05', 'epoch': '0.3844', 'num_input_tokens_seen': 31251549, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3184', 'grad_norm': '0.8313', 'learning_rate': '4.959e-05', 'epoch': '0.3844', 'num_input_tokens_seen': 31253596, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8002', 'grad_norm': '1.791', 'learning_rate': '4.959e-05', 'epoch': '0.3844', 'num_input_tokens_seen': 31255643, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8711', 'grad_norm': '1.425', 'learning_rate': '4.959e-05', 'epoch': '0.3845', 'num_input_tokens_seen': 31257690, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9303', 'grad_norm': '1.466', 'learning_rate': '4.959e-05', 'epoch': '0.3845', 'num_input_tokens_seen': 31259737, 'train_runtime': '1.582e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5037', 'grad_norm': '1.2', 'learning_rate': '4.959e-05', 'epoch': '0.3845', 'num_input_tokens_seen': 31261784, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3809', 'grad_norm': '0.9266', 'learning_rate': '4.959e-05', 'epoch': '0.3845', 'num_input_tokens_seen': 31263831, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3692', 'grad_norm': '0.9295', 'learning_rate': '4.959e-05', 'epoch': '0.3846', 'num_input_tokens_seen': 31265878, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.9593', 'grad_norm': '1.425', 'learning_rate': '4.959e-05', 'epoch': '0.3846', 'num_input_tokens_seen': 31267925, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2608', 'grad_norm': '0.8019', 'learning_rate': '4.958e-05', 'epoch': '0.3846', 'num_input_tokens_seen': 31269972, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4957', 'grad_norm': '1.073', 'learning_rate': '4.958e-05', 'epoch': '0.3846', 'num_input_tokens_seen': 31272019, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6111', 'grad_norm': '0.9966', 'learning_rate': '4.958e-05', 'epoch': '0.3847', 'num_input_tokens_seen': 31274066, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.746', 'grad_norm': '1.375', 'learning_rate': '4.958e-05', 'epoch': '0.3847', 'num_input_tokens_seen': 31276113, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2563', 'grad_norm': '0.8661', 'learning_rate': '4.958e-05', 'epoch': '0.3847', 'num_input_tokens_seen': 31278160, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6398', 'grad_norm': '1.808', 'learning_rate': '4.958e-05', 'epoch': '0.3847', 'num_input_tokens_seen': 31280207, 'train_runtime': '1.583e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3035', 'grad_norm': '0.8281', 'learning_rate': '4.958e-05', 'epoch': '0.3848', 'num_input_tokens_seen': 31282254, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5911', 'grad_norm': '1.369', 'learning_rate': '4.958e-05', 'epoch': '0.3848', 'num_input_tokens_seen': 31284301, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3941', 'grad_norm': '0.9193', 'learning_rate': '4.958e-05', 'epoch': '0.3848', 'num_input_tokens_seen': 31286348, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.02', 'grad_norm': '1.914', 'learning_rate': '4.958e-05', 'epoch': '0.3848', 'num_input_tokens_seen': 31288395, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6697', 'grad_norm': '1.324', 'learning_rate': '4.958e-05', 'epoch': '0.3849', 'num_input_tokens_seen': 31290442, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.301', 'grad_norm': '2.297', 'learning_rate': '4.958e-05', 'epoch': '0.3849', 'num_input_tokens_seen': 31292489, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4309', 'grad_norm': '0.7368', 'learning_rate': '4.958e-05', 'epoch': '0.3849', 'num_input_tokens_seen': 31294536, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5275', 'grad_norm': '1.062', 'learning_rate': '4.958e-05', 'epoch': '0.3849', 'num_input_tokens_seen': 31296583, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2957', 'grad_norm': '1.024', 'learning_rate': '4.958e-05', 'epoch': '0.385', 'num_input_tokens_seen': 31298630, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5363', 'grad_norm': '1.119', 'learning_rate': '4.958e-05', 'epoch': '0.385', 'num_input_tokens_seen': 31300677, 'train_runtime': '1.584e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6148', 'grad_norm': '1.327', 'learning_rate': '4.958e-05', 'epoch': '0.385', 'num_input_tokens_seen': 31302724, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.394', 'grad_norm': '3.588', 'learning_rate': '4.958e-05', 'epoch': '0.385', 'num_input_tokens_seen': 31304771, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7013', 'grad_norm': '1.347', 'learning_rate': '4.958e-05', 'epoch': '0.3851', 'num_input_tokens_seen': 31306818, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.3396', 'grad_norm': '0.95', 'learning_rate': '4.958e-05', 'epoch': '0.3851', 'num_input_tokens_seen': 31308865, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '1.322', 'grad_norm': '2.69', 'learning_rate': '4.958e-05', 'epoch': '0.3851', 'num_input_tokens_seen': 31310912, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.6101', 'grad_norm': '1.227', 'learning_rate': '4.958e-05', 'epoch': '0.3851', 'num_input_tokens_seen': 31312959, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '2.146', 'grad_norm': '2.74', 'learning_rate': '4.958e-05', 'epoch': '0.3852', 'num_input_tokens_seen': 31315006, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8883', 'grad_norm': '1.52', 'learning_rate': '4.958e-05', 'epoch': '0.3852', 'num_input_tokens_seen': 31317053, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.7281', 'grad_norm': '1.648', 'learning_rate': '4.958e-05', 'epoch': '0.3852', 'num_input_tokens_seen': 31319100, 'train_runtime': '1.585e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4333', 'grad_norm': '0.9783', 'learning_rate': '4.958e-05', 'epoch': '0.3853', 'num_input_tokens_seen': 31321147, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.417', 'grad_norm': '1.235', 'learning_rate': '4.958e-05', 'epoch': '0.3853', 'num_input_tokens_seen': 31323194, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4988', 'grad_norm': '0.9373', 'learning_rate': '4.958e-05', 'epoch': '0.3853', 'num_input_tokens_seen': 31325241, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1982', 'grad_norm': '0.7819', 'learning_rate': '4.958e-05', 'epoch': '0.3853', 'num_input_tokens_seen': 31327288, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.5976', 'grad_norm': '1.149', 'learning_rate': '4.958e-05', 'epoch': '0.3854', 'num_input_tokens_seen': 31329335, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.4578', 'grad_norm': '1.004', 'learning_rate': '4.958e-05', 'epoch': '0.3854', 'num_input_tokens_seen': 31331382, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2594', 'grad_norm': '0.8831', 'learning_rate': '4.958e-05', 'epoch': '0.3854', 'num_input_tokens_seen': 31333429, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.8629', 'grad_norm': '1.24', 'learning_rate': '4.958e-05', 'epoch': '0.3854', 'num_input_tokens_seen': 31335476, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.1956', 'grad_norm': '0.7977', 'learning_rate': '4.958e-05', 'epoch': '0.3855', 'num_input_tokens_seen': 31337523, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.73', 'grad_norm': '1.348', 'learning_rate': '4.958e-05', 'epoch': '0.3855', 'num_input_tokens_seen': 31339570, 'train_runtime': '1.586e+04', 'train_tokens_per_second': '1975'} +{'loss': '0.2283', 'grad_norm': '0.7681', 'learning_rate': '4.958e-05', 'epoch': '0.3855', 'num_input_tokens_seen': 31341617, 'train_runtime': '1.587e+04', 'train_tokens_per_second': '1975'} diff --git a/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c637a853f52f7636dbd8c269b6dee573b03cdf7 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.4.0 +fastapi==0.128.1 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.52.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.2 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a114f8e243249f1108a357c39b65a3e69327b497 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-05T02:37:25.915817Z", + "args": [ + "/workspace/v127rc_exp1/D_mul.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "a6086694d22a", + "executable": "/usr/bin/python", + "cpu_count": 24, + "cpu_count_logical": 48, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2604290048" + } + }, + "memory": { + "total": "269721972736" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-ff8ec606-2734-ef52-4257-850162397ce9" + } + ], + "cudaVersion": "12.7", + "writerId": "zh6rt3o374t2f5i8fr2iiq0hoyntbcfj" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e301b7d4eb9be0d29c4dabbe23b8efc1d1f1d4af --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-05T02:37:26.155502518Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"} +{"time":"2026-02-05T02:37:26.502201724Z","level":"INFO","msg":"stream: created new stream","id":"yz385gxb"} +{"time":"2026-02-05T02:37:26.506421573Z","level":"INFO","msg":"handler: started","stream_id":"yz385gxb"} +{"time":"2026-02-05T02:37:26.508247738Z","level":"INFO","msg":"stream: started","id":"yz385gxb"} +{"time":"2026-02-05T02:37:26.508259425Z","level":"INFO","msg":"writer: started","stream_id":"yz385gxb"} +{"time":"2026-02-05T02:37:26.508267638Z","level":"INFO","msg":"sender: started","stream_id":"yz385gxb"} diff --git a/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..11e934691de8601d91cf84a3554bc2512a6e5970 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log @@ -0,0 +1,23 @@ +2026-02-05 02:37:25,931 INFO MainThread:1076 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2 +2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Configure stats pid to 1076 +2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log +2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log +2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():844] calling init triggers +2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():892] starting backend +2026-02-05 02:37:26,147 INFO MainThread:1076 [wandb_init.py:init():895] sending inform_init request +2026-02-05 02:37:26,153 INFO MainThread:1076 [wandb_init.py:init():903] backend started and connected +2026-02-05 02:37:26,155 INFO MainThread:1076 [wandb_init.py:init():973] updated telemetry +2026-02-05 02:37:26,195 INFO MainThread:1076 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-05 02:37:26,815 INFO MainThread:1076 [wandb_init.py:init():1042] starting run threads in backend +2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_console_start():2529] atexit reg +2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-05 02:37:26,894 INFO MainThread:1076 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-05 02:37:26,896 INFO MainThread:1076 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-05 02:37:26,897 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['q_proj', 'o_proj', 'gate_proj', 'down_proj', 'k_proj', 'up_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-05 02:37:26,902 INFO MainThread:1076 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-05 02:37:26,906 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-05 02:37:26,909 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t100_d0_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3887fde07b61c67a2f8efdd1e9c0dfd691a12f7f --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml @@ -0,0 +1,723 @@ +_name_or_path: + value: /workspace/Qwen/Qwen3-8B-Base +_wandb: + value: + cli_version: 0.24.2 + e: + be8ic28wchhzrbkqsu0bl7jl1lfwezfn: + args: + - /workspace/v127rc_exp1/E_mul.yaml + cpu_count: 24 + cpu_count_logical: 48 + cudaVersion: "12.7" + disk: + /: + total: "21474836480" + used: "2594168832" + email: markmochi200@gmail.com + executable: /usr/bin/python + git: + commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63 + remote: https://github.com/hiyouga/LlamaFactory.git + gpu: NVIDIA GeForce RTX 4090 + gpu_count: 1 + gpu_nvidia: + - architecture: Ada + cudaCores: 16384 + memoryTotal: "25757220864" + name: NVIDIA GeForce RTX 4090 + uuid: GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3 + host: 682d471c1c72 + memory: + total: "269721997312" + os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35 + program: /usr/local/bin/llamafactory-cli + python: CPython 3.11.10 + root: /workspace/LlamaFactory + startedAt: "2026-02-05T02:37:31.256607Z" + writerId: be8ic28wchhzrbkqsu0bl7jl1lfwezfn + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.11.10 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 84 + - 98 + - 105 + "3": + - 7 + - 19 + - 62 + - 66 + "4": 3.11.10 + "5": 0.24.2 + "6": 5.0.0 + "9": + "1": transformers_trainer + "12": 0.24.2 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.95 +adam_epsilon: + value: 1e-08 +architectures: + value: + - Qwen3ForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +data_args: + value: + buffer_size: 16384 + cutoff_len: 2047 + data_shared_file_system: false + dataset: + - Markie_Voss_t119_d0_r85 + dataset_dir: /workspace/LlamaFactory/data + default_system: null + enable_thinking: false + eval_dataset: null + eval_num_beams: null + eval_on_each_dataset: false + ignore_pad_token_for_loss: true + interleave_probs: null + mask_history: false + max_samples: 100000000 + media_dir: /workspace/LlamaFactory/data + mix_strategy: concat + neat_packing: false + overwrite_cache: false + packing: true + preprocessing_batch_size: 1000 + preprocessing_num_workers: 16 + streaming: false + template: qwen3_nothink + tokenized_path: null + tool_format: null + train_on_prompt: false + val_size: 0 +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 180000000 +debug: + value: [] +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: false +do_predict: + value: false +do_train: + value: true +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: 151645 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: null +eval_strategy: + value: "no" +eval_use_gather_object: + value: false +finetuning_args: + value: + additional_target: null + apollo_layerwise: false + apollo_proj: random + apollo_proj_type: std + apollo_rank: 16 + apollo_scale: 32 + apollo_scale_front: false + apollo_scale_type: channel + apollo_target: + - all + apollo_update_interval: 200 + badam_mask_mode: adjacent + badam_mode: layer + badam_start_block: null + badam_switch_interval: 50 + badam_switch_mode: ascending + badam_update_ratio: 0.05 + badam_verbose: 0 + compute_accuracy: false + create_new_adapter: false + disable_shuffling: false + dpo_label_smoothing: 0 + eaft_alpha: 1 + early_stopping_steps: null + finetuning_type: lora + freeze_extra_modules: null + freeze_language_model: false + freeze_multi_modal_projector: true + freeze_trainable_layers: 2 + freeze_trainable_modules: + - all + freeze_vision_tower: true + galore_layerwise: false + galore_proj_type: std + galore_rank: 16 + galore_scale: 2 + galore_target: + - all + galore_update_interval: 200 + include_effective_tokens_per_second: false + kto_chosen_weight: 1 + kto_rejected_weight: 1 + ld_alpha: null + lora_alpha: 32 + lora_dropout: 0.03 + lora_rank: 16 + lora_target: + - all + loraplus_lr_embedding: 1e-06 + loraplus_lr_ratio: null + module_dropout: 0 + oft_block_size: 32 + oft_rank: 0 + oft_target: + - all + pissa_convert: false + pissa_init: false + pissa_iter: 16 + plot_loss: true + ppo_buffer_size: 1 + ppo_epochs: 4 + ppo_score_norm: false + ppo_target: 6 + ppo_whiten_rewards: false + pref_bco_weight: 0 + pref_beta: 0.1 + pref_ftx: 0 + pref_loss: sigmoid + pure_bf16: false + ref_model: null + ref_model_adapters: null + ref_model_quantization_bit: null + reward_model: null + reward_model_adapters: null + reward_model_quantization_bit: null + reward_model_type: lora + simpo_gamma: 0.5 + stage: pt + swanlab_api_key: + swanlab_lark_secret: null + swanlab_lark_webhook_url: null + swanlab_logdir: null + swanlab_mode: cloud + swanlab_project: llamafactory + swanlab_run_name: null + swanlab_workspace: null + use_adam_mini: false + use_apollo: false + use_badam: false + use_dft_loss: false + use_dora: false + use_eaft_loss: false + use_galore: false + use_llama_pro: false + use_mca: false + use_muon: false + use_rslora: false + use_swanlab: false +fp8: + value: false +fp8_backend: + value: auto +fp8_enable_fsdp_float8_all_gather: + value: false +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +generating_args: + value: + do_sample: true + length_penalty: 1 + max_new_tokens: 1024 + num_beams: 1 + repetition_penalty: 1 + skip_special_tokens: true + temperature: 0.95 + top_k: 50 + top_p: 0.7 +generation_config: + value: null +generation_max_length: + value: 2047 +generation_num_beams: + value: null +gradient_accumulation_steps: + value: 1 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 4096 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: all +initializer_range: + value: 0.02 +intermediate_size: + value: 12288 +is_encoder_decoder: + value: false +label_names: + value: + - labels +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +layer_types: + value: + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention + - full_attention +learning_rate: + value: 5e-05 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: false +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +master_addr: + value: null +master_port: + value: null +max_grad_norm: + value: 1 +max_position_embeddings: + value: 32768 +max_steps: + value: -1 +max_window_layers: + value: 36 +metric_for_best_model: + value: null +model/num_parameters: + value: 8234382336 +model_args: + value: + adapter_folder: null + adapter_name_or_path: null + add_special_tokens: null + add_tokens: null + audio_sampling_rate: 16000 + block_diag_attn: false + cache_dir: null + chunk_size: 8192 + compute_dtype: torch.bfloat16 + cpu_infer: 32 + crop_to_patches: false + device_map: + "": cuda:0 + disable_gradient_checkpointing: false + double_quantization: true + enable_liger_kernel: false + export_device: cpu + export_dir: null + export_hub_model_id: null + export_legacy_format: false + export_quantization_bit: null + export_quantization_dataset: null + export_quantization_maxlen: 1024 + export_quantization_nsamples: 128 + export_size: 5 + flash_attn: auto + hf_hub_token: + image_do_pan_and_scan: false + image_max_pixels: 589824 + image_min_pixels: 1024 + infer_backend: HF + infer_dtype: auto + init_special_tokens: noise_init + kt_force_think: false + kt_maxlen: 4096 + kt_mode: normal + kt_optimize_rule: null + kt_use_cuda_graph: true + low_cpu_mem_usage: true + mixture_of_depths: null + mode: normal + model_max_length: 2047 + model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + model_revision: main + moe_aux_loss_coef: null + ms_hub_token: + new_special_tokens_config: null + offload_folder: offload + om_hub_token: + print_param_status: false + quantization_bit: null + quantization_device_map: null + quantization_method: BNB + quantization_type: nf4 + resize_vocab: false + rope_scaling: null + sglang_config: null + sglang_lora_backend: triton + sglang_maxlen: 4096 + sglang_mem_fraction: 0.7 + sglang_tp_size: -1 + shift_attn: false + split_special_tokens: false + train_from_scratch: false + trust_remote_code: true + upcast_layernorm: false + upcast_lmhead_output: false + use_audio_in_video: false + use_fast_tokenizer: true + use_kt: false + use_kv_cache: true + use_reentrant_gc: true + use_unsloth: false + use_unsloth_gc: false + use_v1_kernels: false + video_fps: 2 + video_max_pixels: 65536 + video_maxlen: 128 + video_min_pixels: 256 + vllm_config: null + vllm_enforce_eager: false + vllm_gpu_util: 0.7 + vllm_max_lora_rank: 32 + vllm_maxlen: 4096 +model_type: + value: qwen3 +neftune_noise_alpha: + value: null +num_attention_heads: + value: 32 +num_hidden_layers: + value: 36 +num_key_value_heads: + value: 8 +num_train_epochs: + value: 5 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /workspace/v127rc_exp1/E_mul +output_hidden_states: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: 151643 +parallelism_config: + value: null +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.03 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.1 + qalora_group_size: 16 + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - v_proj + - gate_proj + - o_proj + - up_proj + - k_proj + - down_proj + - q_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 1 +predict_with_generate: + value: false +prediction_loss_only: + value: false +problem_type: + value: null +project: + value: huggingface +push_to_hub: + value: false +ray_init_kwargs: + value: null +ray_num_workers: + value: 1 +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +rms_norm_eps: + value: 1e-06 +rope_parameters: + value: + rope_theta: 1000000 + rope_type: default +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: true +save_steps: + value: 1000 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +skip_memory_metrics: + value: true +sliding_window: + value: null +sortish_sampler: + value: false +tf32: + value: null +tie_word_embeddings: + value: false +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +transformers_version: + value: 5.0.0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +use_sliding_window: + value: false +vocab_size: + value: 151936 +warmup_ratio: + value: 0.02 +warmup_steps: + value: 0.02 +weight_decay: + value: 0 diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c637a853f52f7636dbd8c269b6dee573b03cdf7 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.4.0 +fastapi==0.128.1 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.52.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.2 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..11d8ae227484998b4f19f3669f0f85efadcf0dd2 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-05T02:37:31.256607Z", + "args": [ + "/workspace/v127rc_exp1/E_mul.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "682d471c1c72", + "executable": "/usr/bin/python", + "cpu_count": 24, + "cpu_count_logical": 48, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2594168832" + } + }, + "memory": { + "total": "269721997312" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3" + } + ], + "cudaVersion": "12.7", + "writerId": "be8ic28wchhzrbkqsu0bl7jl1lfwezfn" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..3efc6605b77b199da0dfa5fc6a7d817440114047 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json @@ -0,0 +1 @@ +{"train_runtime":202598.5168,"train_samples_per_second":0.963,"_timestamp":1.770461649358481e+09,"_step":195010,"train/train_tokens_per_second":1970.359,"train/loss":0.7374985218048096,"train/grad_norm":2.825721025466919,"train/global_step":195010,"_runtime":202601,"_wandb":{"runtime":202601},"train/epoch":5,"total_flos":1.8231724481360794e+19,"train/learning_rate":3.3779062880157087e-15,"train_loss":0.3935867749506399,"train_steps_per_second":0.963,"train/num_input_tokens_seen":399185470} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug-internal.log b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..768ebf98facf9592c717df8868aeb64eea94c4a4 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug-internal.log @@ -0,0 +1,13 @@ +{"time":"2026-02-05T02:37:31.49899511Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"} +{"time":"2026-02-05T02:37:31.852360923Z","level":"INFO","msg":"stream: created new stream","id":"1lb2e6m1"} +{"time":"2026-02-05T02:37:31.852871538Z","level":"INFO","msg":"handler: started","stream_id":"1lb2e6m1"} +{"time":"2026-02-05T02:37:31.853911398Z","level":"INFO","msg":"stream: started","id":"1lb2e6m1"} +{"time":"2026-02-05T02:37:31.853930477Z","level":"INFO","msg":"writer: started","stream_id":"1lb2e6m1"} +{"time":"2026-02-05T02:37:31.853932159Z","level":"INFO","msg":"sender: started","stream_id":"1lb2e6m1"} +{"time":"2026-02-06T18:59:32.437758847Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/1lb2e6m1/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-06T21:37:35.657916123Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/1lb2e6m1/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-07T10:54:13.533366867Z","level":"INFO","msg":"stream: closing","id":"1lb2e6m1"} +{"time":"2026-02-07T10:54:16.209457181Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-07T10:54:16.437506486Z","level":"INFO","msg":"handler: closed","stream_id":"1lb2e6m1"} +{"time":"2026-02-07T10:54:16.44112648Z","level":"INFO","msg":"sender: closed","stream_id":"1lb2e6m1"} +{"time":"2026-02-07T10:54:16.441949856Z","level":"INFO","msg":"stream: closed","id":"1lb2e6m1"} diff --git a/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug.log b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d6f69862bb8351723ffd346f938fd993847b0789 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug.log @@ -0,0 +1,25 @@ +2026-02-05 02:37:31,274 INFO MainThread:1929 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2 +2026-02-05 02:37:31,274 INFO MainThread:1929 [wandb_setup.py:_flush():81] Configure stats pid to 1929 +2026-02-05 02:37:31,274 INFO MainThread:1929 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-05 02:37:31,275 INFO MainThread:1929 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug.log +2026-02-05 02:37:31,276 INFO MainThread:1929 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/logs/debug-internal.log +2026-02-05 02:37:31,276 INFO MainThread:1929 [wandb_init.py:init():844] calling init triggers +2026-02-05 02:37:31,276 INFO MainThread:1929 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-05 02:37:31,277 INFO MainThread:1929 [wandb_init.py:init():892] starting backend +2026-02-05 02:37:31,490 INFO MainThread:1929 [wandb_init.py:init():895] sending inform_init request +2026-02-05 02:37:31,496 INFO MainThread:1929 [wandb_init.py:init():903] backend started and connected +2026-02-05 02:37:31,498 INFO MainThread:1929 [wandb_init.py:init():973] updated telemetry +2026-02-05 02:37:31,558 INFO MainThread:1929 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-05 02:37:32,213 INFO MainThread:1929 [wandb_init.py:init():1042] starting run threads in backend +2026-02-05 02:37:32,286 INFO MainThread:1929 [wandb_run.py:_console_start():2529] atexit reg +2026-02-05 02:37:32,286 INFO MainThread:1929 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-05 02:37:32,287 INFO MainThread:1929 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-05 02:37:32,287 INFO MainThread:1929 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-05 02:37:32,289 INFO MainThread:1929 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-05 02:37:32,290 INFO MainThread:1929 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['v_proj', 'gate_proj', 'o_proj', 'up_proj', 'k_proj', 'down_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/E_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-05 02:37:32,296 INFO MainThread:1929 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-05 02:37:32,296 INFO MainThread:1929 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-05 02:37:32,298 INFO MainThread:1929 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t119_d0_r85'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2026-02-07 10:54:13,533 INFO wandb-AsyncioManager-main:1929 [service_client.py:_forward_responses():94] Reached EOF. +2026-02-07 10:54:13,534 INFO wandb-AsyncioManager-main:1929 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles. diff --git a/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/requirements.txt b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c637a853f52f7636dbd8c269b6dee573b03cdf7 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/requirements.txt @@ -0,0 +1,257 @@ +pytz==2025.2 +pydub==0.25.1 +brotli==1.2.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.6.0 +websockets==15.0.1 +tzdata==2025.3 +typing_extensions==4.15.0 +tqdm==4.67.3 +tomlkit==0.13.3 +termcolor==3.3.0 +shtab==1.8.0 +shellingham==1.5.4 +sentencepiece==0.2.1 +semantic-version==2.10.0 +safetensors==0.7.0 +ruff==0.15.0 +regex==2026.1.15 +python-multipart==0.0.22 +pyparsing==3.3.2 +pyarrow==23.0.0 +protobuf==6.33.5 +propcache==0.4.1 +orjson==3.11.7 +omegaconf==2.3.0 +numpy==2.4.2 +multidict==6.7.1 +mdurl==0.1.2 +kiwisolver==1.4.9 +hf-xet==1.2.0 +hf_transfer==0.1.9 +groovy==0.1.2 +frozenlist==1.8.0 +fonttools==4.61.1 +ffmpy==1.0.0 +einops==0.8.2 +docstring_parser==0.17.0 +dill==0.3.8 +cycler==0.12.1 +click==8.3.1 +av==16.0.0 +annotated-types==0.7.0 +annotated-doc==0.0.4 +aiohappyeyeballs==2.6.1 +aiofiles==24.1.0 +yarl==1.22.0 +uvicorn==0.40.0 +typing-inspection==0.4.2 +typer-slim==0.21.1 +tiktoken==0.12.0 +scipy==1.17.0 +pydantic_core==2.41.4 +pandas==2.3.3 +multiprocess==0.70.16 +modelscope==1.34.0 +markdown-it-py==4.0.0 +fire==0.7.1 +contourpy==1.3.3 +anyio==4.12.1 +aiosignal==1.4.0 +starlette==0.50.0 +rich==14.3.2 +pydantic==2.12.3 +matplotlib==3.10.8 +aiohttp==3.13.3 +tyro==0.8.14 +typer==0.21.1 +torchdata==0.11.0 +sse-starlette==3.2.0 +safehttpx==0.1.7 +huggingface_hub==1.4.0 +fastapi==0.128.1 +tokenizers==0.22.2 +gradio_client==1.14.0 +datasets==4.0.0 +accelerate==1.11.0 +transformers==5.0.0 +gradio==5.50.0 +trl==0.24.0 +peft==0.18.1 +llamafactory==0.9.5.dev0 +jieba==0.42.1 +rouge-chinese==1.0.3 +joblib==1.5.3 +nltk==3.9.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==13.590.48 +hjson==3.1.0 +ninja==1.13.0 +msgpack==1.1.2 +deepspeed==0.16.9 +smmap==5.0.2 +sentry-sdk==2.52.0 +gitdb==4.0.12 +GitPython==3.1.46 +wandb==0.24.2 +entrypoints==0.4 +jupyter_client==7.4.9 +nbclassic==1.1.0 +notebook==6.5.5 +pyzmq==24.0.1 +PyYAML==6.0.2 +Send2Trash==1.8.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.1.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.3.2 +comm==0.2.2 +debugpy==1.8.5 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.1.0 +fastjsonschema==2.20.0 +fqdn==1.5.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.27.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.1 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +jupyter-archive==3.4.0 +jupyter_contrib_core==0.4.2 +jupyter_contrib_nbextensions==0.7.0 +jupyter_core==5.7.2 +jupyter-events==0.10.0 +jupyter-highlight-selected-word==0.2.0 +jupyter-lsp==2.2.5 +jupyter_nbextensions_configurator==0.6.4 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +lxml==5.3.0 +matplotlib-inline==0.1.7 +mistune==3.0.2 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook_shim==0.2.4 +overrides==7.7.0 +packaging==24.1 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.47 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +Pygments==2.18.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.20.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +terminado==0.18.1 +tinycss2==1.3.0 +tornado==6.4.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20240906 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 +widgetsnbextension==4.0.13 +Jinja2==3.1.3 +MarkupSafe==2.1.5 +filelock==3.13.1 +fsspec==2024.2.0 +mpmath==1.3.0 +networkx==3.2.1 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 +nvidia-nvtx-cu12==12.4.99 +pillow==10.2.0 +sympy==1.12 +torch==2.4.1+cu124 +torchaudio==2.4.1+cu124 +torchvision==0.19.1+cu124 +triton==3.0.0 +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +SecretStorage==3.3.1 +blinker==1.4 +cryptography==3.4.8 +dbus-python==1.2.18 +distro==1.7.0 +httplib2==0.20.2 +importlib-metadata==4.6.4 +jeepney==0.7.1 +keyring==23.5.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +more-itertools==8.10.0 +oauthlib==3.2.0 +python-apt==2.4.0+ubuntu4 +six==1.16.0 +wadllib==1.3.6 +zipp==1.0.0 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +importlib_resources==6.4.0 +inflect==7.3.1 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +jaraco.text==3.12.1 +more-itertools==10.3.0 +packaging==24.1 +platformdirs==4.2.2 +tomli==2.0.1 +typeguard==4.3.0 +typing_extensions==4.12.2 +wheel==0.43.0 +zipp==3.19.2 diff --git a/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/wandb-metadata.json b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9537a7da9ff593f8353c5429f55de55c780ae106 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/files/wandb-metadata.json @@ -0,0 +1,41 @@ +{ + "os": "Linux-6.8.0-90-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.10", + "startedAt": "2026-02-05T02:37:38.691775Z", + "args": [ + "/workspace/v127rc_exp1/C_mul.yaml" + ], + "program": "/usr/local/bin/llamafactory-cli", + "git": { + "remote": "https://github.com/hiyouga/LlamaFactory.git", + "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63" + }, + "email": "markmochi200@gmail.com", + "root": "/workspace/LlamaFactory", + "host": "1a988b47540d", + "executable": "/usr/bin/python", + "cpu_count": 24, + "cpu_count_logical": 48, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "21474836480", + "used": "2595487744" + } + }, + "memory": { + "total": "270083489792" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada", + "uuid": "GPU-85954e97-6854-f801-5620-71763b06e144" + } + ], + "cudaVersion": "12.8", + "writerId": "3kvhndmvw04ba4cffvg8ddplta1rx8pe" +} \ No newline at end of file diff --git a/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug-internal.log b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..58b95e96a008a06eb36e01efeec61057f24159a5 --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-02-05T02:37:38.957345427Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"} +{"time":"2026-02-05T02:37:39.310942049Z","level":"INFO","msg":"stream: created new stream","id":"7rn01zb3"} +{"time":"2026-02-05T02:37:39.311891011Z","level":"INFO","msg":"handler: started","stream_id":"7rn01zb3"} +{"time":"2026-02-05T02:37:39.313946712Z","level":"INFO","msg":"stream: started","id":"7rn01zb3"} +{"time":"2026-02-05T02:37:39.313977703Z","level":"INFO","msg":"sender: started","stream_id":"7rn01zb3"} +{"time":"2026-02-05T02:37:39.314020334Z","level":"INFO","msg":"writer: started","stream_id":"7rn01zb3"} diff --git a/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug.log b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7ee9465f38fd41692d09cbc1521bf21e0681d6ef --- /dev/null +++ b/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug.log @@ -0,0 +1,23 @@ +2026-02-05 02:37:38,724 INFO MainThread:1392 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2 +2026-02-05 02:37:38,725 INFO MainThread:1392 [wandb_setup.py:_flush():81] Configure stats pid to 1392 +2026-02-05 02:37:38,725 INFO MainThread:1392 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-05 02:37:38,726 INFO MainThread:1392 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug.log +2026-02-05 02:37:38,727 INFO MainThread:1392 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023738-7rn01zb3/logs/debug-internal.log +2026-02-05 02:37:38,727 INFO MainThread:1392 [wandb_init.py:init():844] calling init triggers +2026-02-05 02:37:38,728 INFO MainThread:1392 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-02-05 02:37:38,728 INFO MainThread:1392 [wandb_init.py:init():892] starting backend +2026-02-05 02:37:38,946 INFO MainThread:1392 [wandb_init.py:init():895] sending inform_init request +2026-02-05 02:37:38,954 INFO MainThread:1392 [wandb_init.py:init():903] backend started and connected +2026-02-05 02:37:38,956 INFO MainThread:1392 [wandb_init.py:init():973] updated telemetry +2026-02-05 02:37:39,020 INFO MainThread:1392 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-05 02:37:39,707 INFO MainThread:1392 [wandb_init.py:init():1042] starting run threads in backend +2026-02-05 02:37:39,879 INFO MainThread:1392 [wandb_run.py:_console_start():2529] atexit reg +2026-02-05 02:37:39,879 INFO MainThread:1392 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-05 02:37:39,880 INFO MainThread:1392 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-05 02:37:39,880 INFO MainThread:1392 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-05 02:37:39,882 INFO MainThread:1392 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-05 02:37:39,884 INFO MainThread:1392 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['v_proj', 'k_proj', 'up_proj', 'gate_proj', 'o_proj', 'q_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False} +2026-02-05 02:37:39,892 INFO MainThread:1392 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - > +2026-02-05 02:37:39,892 INFO MainThread:1392 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None +2026-02-05 02:37:39,895 INFO MainThread:1392 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t70_d0_r143'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/v127rc_exp2/B_mul/checkpoint-10000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b07abbbd93ee7b3e32a89310e065917ed1df311c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500fde0ae0a3e47fe96ea40790b427d5d8eb1cf3a5130d55ea12007b679cb4ea +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b37533afb96a73e11f21aba90e1b32b5168f427 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db8fef73e10acb47bcd956089e7b2149237df66b815cf70bbbf2d316f5e72641 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e161229b9c5a1ec9504e0d752ba580e8682239cf --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bba0a794476e27d32f60804acbbccae10a8c44ae6a8e72115c47996e03e06b7c +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a89297e01077481ff6d59069eacc8b8440e7d13 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d7f35cbd33281c9e051326c03721236d3088bacdb5c790c7d12e62e775c850 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee47406b40f9d4950a0e836e6d156266ad03ce35 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f13187847413d1e5e4c47c2d67deaa794d19e2edb64036d8e4c6154f406a145 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..917a216f96862fd975d6423adc0a2afa5cde786a --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797d954f1d4aeb6a297e2406ae35c67773f55ad4c5c3b6009d6a691173a4ff26 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f333e6b2e59bc7ea7ba4c3a8f7c1c476da6cac61 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9228abb06892518a82d7f3f7ecc1c5be0f91ccd9e6322e59ca9607e34f1db537 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-10700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-10700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff725008ed92d1e759e89645c21c80632ad3c9c5 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-10700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3431bfe228289e23c79a6d2ba4db586605384f1284342f20e9a8bfcd02b5c5d6 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-11000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-11000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..010560dd9595aefb44dfc89ca04b2e0ee8b01e6c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-11000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc0c3cf4cabdcf1d13271a0cd52715cc733cad84611b89fc0e4c5c8cc3c54da +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-2900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-2900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-2900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-3600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e32bf95fd96fd92ddc3fbcb717f5a75e7a45dcec --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75793b946f58047dc36022b1828b1601e47ac719400de1c3e15dd363ed8a49de +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-3800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..da495bf3f4394c47e1fc469878b9343a497af062 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7801499e37854b842c7da6ff34f8c262f3be66d4a1101e2b9d8ad268c6065f6c +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-3900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-3900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d2644c26f7e2551466fca86f14a1825d1911d6a8 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c304f7e2feadf7e43d3ecfc2111e0fd07e08041b147d268f4ea8f4eaccdf5d7 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-3900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-3900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-3900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..075eefc4b40019af1f6f5646974d752cb52ca005 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ae7fe5ecdc72c0f7a36671d475866c233b460b83f7fc6a0f5ebcebeb565cd2 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a3a6b26e889dbdc82b6043f3a3f4c387a9aa0cb --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb1493e580dbbc4a667c7aff6465ae3779ce622dab3992ddf558cbcf81ebe77 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad39a3a64f87b6fcf54c320f22d3feac661370d0 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc38b9ed8e2dee8815e2bdcbb1252bf35685af607e74298715c22dea976934c +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d9010699d404394a7a57e97ea7dd008a9173cf7e --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf2d1559942a59e2a649423caa03b46df56daaf51429573db5bd2b26941321d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dd2fe09d84d048f035899cdb0f6fc4980e8767f --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9ead15110b2efd06b7fddaf63ee50fe460f08f625e1e4b1ebef21932b1c675 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f40b31ad429bd349e80adf07bd3a8001d724fff2 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8318aa47affc1c3318f9aae48ef06d4118d417d512642f6f1d1b2bbc5b621e95 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fe167324b89240d908afa65b59bcc735d3075323 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:317838bd32f89089a0525bb82aecaab447c8d91eb512813de5350838ded945c6 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7226ee18f9d698427a37123e75d28164979bce2e --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2edd6ac0b92acefd41b8fc441d93ca5c946bcf63e76fbb05fc6421267f537abd +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9961a2274c203ad447a0f4f9841163c34a3f86a7 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da0e1304a8b10328fbfd1bcf061d95aa9fa0e75006ad35aab0eb8b1a6abbb115 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-4900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-4900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..88e2e7247af8e880d63f260fe5b54ed98bb7a348 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db853b84710b3a5cc1cb131be343d318b21a326ae994a33e992e73568ae8946 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-4900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-4900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-4900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4e0d454e959d99c94ccd8e69767ca6f311fa7d66 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c4dfc2bf4729593f99fc15b9dc85c4662f75933cdffbb6ad3bf030676e7c67 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16852c8eff1a707c8b6e683cd5f022ecf855bbc0 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0e9ca33471e67d4358cb4db55622be36be082b886abec83f7468453a7b870b +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d5705ce6b581c4fb1eecb0ee69f1f8207e9fc07 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e6117613512d0eb1430fd70f8cf47eedf0951640b199f9a1c08e6735013e3b +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9aa188990097f626cc09d89a93d39f96c57bf0d2 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9acc6e48975b2f07f7f285acc718ad731a6f7693d01c194b09b1e31bb2b8e606 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bad586a6d700513d308d6a3fd12e199fe8cc19c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e4d88e25974a43f8ab326cfb3990bc4021bf5d95a2c2cd87425859bbe2f594 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1ee021dffcd311d8989c17a3373884c319d64d27 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8b5f33f116e75714b20879f8c62fb69fd9e7f4a04340f012c9a163f9d9e373 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce8477e9efbb2163d26a1d2d978d44005306056c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4d3be2dd4f6019a28a1e235e10fb7efc330d854eab2affcc0457d8524c5ed7c +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed40b1fd44e8c8bdd0baac21ac09c330a578daa8 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7deb52e86ca029cd3d3566a53d28b158ddfa2cb450e8b8674cb6b629fd0ab86 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..457c0531a6b4b3b18dcbc21864136458ae77bc36 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c896a403e588849223b593a7f32bbaec7a7a054a2bf4a668a7a49e273f6ba8f0 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-5900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-5900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7be905d25cd0dd310379d8c17058cbec6f83fa21 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f936429e98ef0ceab81a49d891e26bda04a14e4d4e1832e0e09f85f022ce613 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-5900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-5900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-5900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7a65712b44a08760c1a36bcaa2da044d69eb5485 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2353e4f64511311f0e6982ddd7d0ccba8f4937a048e44e3ad473ff77b4b05f6a +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73d08fe67e448ec52db9d5c85058d48866e4f71d --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbace070ab02464f41967b1fa4814c6864339fc2c624eb30a86e0e2aece2059f +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23c83611c52000279e1a44b884833af973e5ff41 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb4c0c42da4b212efdbff95075a6c43ae514786bc64bd170c01248825bfb5b43 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..872695f987011d6bd5907138c15eeff0cbfa4808 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe52218cdd505ea506b5b541cd98f95401c2e5f3b6b3914ea08a53041fbc365 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49ff4aa2ed7b3e249843096519b1b7ac27a15716 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5190e595355e7cc43914df2eb74cd040b1423c79da744e52308ac0b7cbba0462 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bdea89f184ff719d7afb49eb383d7f514a4bb7ab --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf87ae2cf1361da477b2b9a120b42566bd3bba9724487fc82bc0fdcafd97b642 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3840a501154680bcbe1b3e5a8ef6e2ec70d236c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64133d8c914711103a939b73f154412c2331b2b7209fb2a2df728079c7da779a +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a005320b85d4b0aafdff92fd6d286421c7ed956 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4cc908e0b60e02958b0bde5dea552f5c6581a2f804131ca54a92567965ccd19 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1466777cb8abefdecf87a399dd1ab484535c59d1 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf340221bc2d7c1fa01e2fb01064347533c9cee8f3adcc17eb8e7bc009cd8c5f +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-6900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-6900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e819b7655fa851d57ffafa647dde93a904c1d1ef --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b2845c7f707844999b49daea634d4c38677d5666adc593c46c408e498e311d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-6900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-6900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-6900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..63d0c1f8051902e83e1aed1b8fea34de2b21164b --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:101ee358e5f8daf4d321ec6af05b98567f194b98d408925e09b1a51970a96916 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..503fe308e5918fa58bccf94256bc6dee6bb9f721 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b63dc3deb6df03d037f46231bd7facd91184b596934ee825f2007af6bbe7ee6 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81a22d4aaa1994bf7da2647b0f8bb1e413b2d3c4 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17e3c84fa11eb1d8dff79c78db3e8532d68255b72cda6ffc99e334e42c21c6c +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7cd4bb907803fef8faec881c6e5bd437f2d5722c --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7a0bdebd33ab2a6d0daeee7b27786d8c8619bdf78122b5fd320eacb437f1b5 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db768e12d23d531ea8254e9500c99b074e58f4b6 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c5d79a29fe5da375c41f83355bda4cfc9939c86c0bcdade92181e45e5b16b4 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e555770a61f9401905c0b350c541e30b4dc01a15 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b29f3cf3b3f686ee2ab87fa3bf1e3259e2652bf559734ff3514ed010abde7f0e +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..24768d1d7512f8865825b2993ea709d52bbda640 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fabb7a5d3e2fe06eeed38320b1d4a84e3ab791a834428b71f98624db9186231 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff0b65bf709c0d5966dc299a58a7820f80b529e9 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece771090174338c23e4b9091ad4421e0e2a06d0afc144429f7f38b1c4b4fe86 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0b71b1e4d488f70eadce270c96b2225213b6c99 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbae640d0d1efe1f7b138bd5aba156d9aa343ad44983cdff72f97b5310dae14f +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-7900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-7900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ea612ea524a9fbacea648cff8ede12921134b18 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624d1103251e8ac34cfbee2173c4f753c54960c0af1ccd9d7925e032bd1a9cb1 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-7900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-7900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-7900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4e2487b0bcd27349bbb83c8bfcfc4d483704adb --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e610bcd58bf40a7560c07a45e308ad4feade5b1883e6a67f296d901daaa5effe +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..997a426b34e3392a41f65c49b7791c43924cbb7e --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acdac2d1682baa4475a991917a31e41eb9f1d9b9d295479eace894a4c9063c14 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3a0f013b3bc9ca00f333c84be32c9b05fa4e929 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e357d057715a7cdab345ab8bb1b0102ab41730e44f52aa211924065a8c0df6 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8200/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..abc52f6c83e6f7d024d831a043f8a89122b69d34 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27dfeccc2533c60ee8db15cc669a18a2916bfd30a7b6b89662a41fa66e1507d9 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8300/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e97173c8d7db5c1ef743eb62e03b709055d6000b --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70975e8d431987ae5cfd17b02051929dba959099ee15129c593f8ca51ffc363 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8400/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec8624140d42de2510d96ab0f50c1745488064db --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa89a84a9d6bb83be9c44f26536e5c7a0665339a894bb77178fe547669fd193d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8500/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9778a7bc04135d45ffa7dedbccdd78d17360fe33 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a25031d063e2c0c71b40498cc4513b60bb1c53ff3e44dcbba86bfcd19b3c8ed +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8600/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a4d1993bcdb6c2d6ab7181a0ae2768cb98f5991 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7f2c775d46918a96f3797fa9c92ab5d56d7a091bf254afe9d6e6ec20716308 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8700/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3aed9be92b05e3aca554f2a3c17b16670f5460e --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a113aac9ff68839d4f8417b65b2219acc5d94d20f40a8058e6515995dece2d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8800/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-8900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-8900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..947bec2ee161f3d3cd6e000ddebd67bf8ef57d11 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db435beab7382fdd836b7d33de080cef31a10ebcb284cf15310a798b41d22c90 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-8900/training_args.bin b/v127rc_exp2/B_mul/checkpoint-8900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-8900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-9000/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60d79983812f0125e62a2ea69bf03c6512b5300f --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107e0f422816eb13314d7c5af0800da80d32a05cda42323e164c22fd71506a0d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-9000/training_args.bin b/v127rc_exp2/B_mul/checkpoint-9000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-9100/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec9b478aae801229b78caabf9b4538edead8cb2e --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bb172ac600f02223e97123a9b3411daf87dabc9571e109e26b967b197e2d18 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json b/v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9690ff78b03614a8bb7931635280dd0f90696434 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47cee88ab6192c51196be5e721997b068132bbdfd85cec4b9e3ca3187cfca473 +size 11422648 diff --git a/v127rc_exp2/B_mul/checkpoint-9100/training_args.bin b/v127rc_exp2/B_mul/checkpoint-9100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..82f83133e67b4f57b52a7a178ce5c79a4cfe3499 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47965d12b5088632fd2e16ac8f7250027a965bf0fd4e5e4a6d1df722403e8425 +size 5176 diff --git a/v127rc_exp2/B_mul/checkpoint-9200/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..163651eee1ed797c6d66c9c360702725571ae891 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ace8c8f250eed3526f91113ea3777670a9dee642a4a331e35121ec62506e2d +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9300/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fa2cb5a7ce670c43952b9408e0070085133e98b --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb2c09baaa0f46de9a0c8e9ea02c7e8610c861eb6a488b0a78342ea53fb7566 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9400/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5f85b58c36d1a2de0154f72658bbb53270973ea --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd62cf7b3f1020f54a02470ad985ad0fdddbb5c2917570e8e2c077e2bc64cb6 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9500/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3977f378b19242faef160e78316e1af5db041147 --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbe9c020eafa23db4e997d57077ac8cd567f5ce379ff63052c2b8441c57861e +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9600/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0fb671edc2c43913fc78e52193c551b2efa46ce --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7566d943de6236c168c35f6aa795569e321ce7ae8ebdcc41e8f72159ff953cfe +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9700/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..df8549ca35365b9d8feec49b94c3b52bede5255b --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b1609da35807f035e7bcfd9c209c67c8aac3b466c6cf36e1dbacf81ac93a8a9 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9800/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba4a527e7d388ac5c803ef8a0bb5aa6ffef8e55d --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a749a6ae44b97a9125ed6b058fbb36a4c657322e025ca3f0410ebb6fdf79d1 +size 349243752 diff --git a/v127rc_exp2/B_mul/checkpoint-9900/adapter_model.safetensors b/v127rc_exp2/B_mul/checkpoint-9900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c6049576584fbe8106c277da6753c2e8daf3005d --- /dev/null +++ b/v127rc_exp2/B_mul/checkpoint-9900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b89d212600f2a1ee35bb00af61717e9ee0d5b1a62fdc31e25c0bb4123f020d +size 349243752